howard.objects.variants

   1import csv
   2import gc
   3import gzip
   4import io
   5import multiprocessing
   6import os
   7import random
   8import re
   9import shlex
  10import sqlite3
  11import subprocess
  12from tempfile import NamedTemporaryFile, TemporaryDirectory
  13import tempfile
  14import duckdb
  15import json
  16import yaml
  17import argparse
  18import Bio.bgzf as bgzf
  19import pandas as pd
  20from pyfaidx import Fasta
  21import numpy as np
  22import vcf
  23import logging as log
  24import fastparquet as fp
  25from multiprocesspandas import applyparallel
  26
  27from howard.functions.commons import *
  28from howard.objects.database import *
  29from howard.functions.databases import *
  30from howard.functions.utils import *
  31
  32
  33class Variants:
  34
  35    def __init__(
  36        self,
  37        conn=None,
  38        input: str = None,
  39        output: str = None,
  40        config: dict = {},
  41        param: dict = {},
  42        load: bool = False,
  43    ) -> None:
  44        """
  45        The function `__init__` initializes the variables, sets the input, output, config, param, connexion and
  46        header
  47
  48        :param conn: the connection to the database
  49        :param input: the input file
  50        :param output: the output file
  51        :param config: a dictionary containing the configuration of the model
  52        :param param: a dictionary containing the parameters of the model
  53        """
  54
  55        # Init variables
  56        self.init_variables()
  57
  58        # Input
  59        self.set_input(input)
  60
  61        # Config
  62        self.set_config(config)
  63
  64        # Param
  65        self.set_param(param)
  66
  67        # Output
  68        self.set_output(output)
  69
  70        # connexion
  71        self.set_connexion(conn)
  72
  73        # Header
  74        self.set_header()
  75
  76        # Load data
  77        if load:
  78            self.load_data()
  79
  80    def set_input(self, input: str = None) -> None:
  81        """
  82        The function `set_input` takes a file name as input, extracts the name and extension, and sets
  83        attributes in the class accordingly.
  84        
  85        :param input: The `set_input` method in the provided code snippet is used to set attributes
  86        related to the input file. Here's a breakdown of the parameters and their usage in the method:
  87        :type input: str
  88        """
  89
  90        if input and not isinstance(input, str):
  91            try:
  92                self.input = input.name
  93            except:
  94                log.error(f"Input file '{input} in bad format")
  95                raise ValueError(f"Input file '{input} in bad format")
  96        else:
  97            self.input = input
  98
  99        # Input format
 100        if input:
 101            input_name, input_extension = os.path.splitext(self.input)
 102            self.input_name = input_name
 103            self.input_extension = input_extension
 104            self.input_format = self.input_extension.replace(".", "")
 105
 106    def set_config(self, config: dict) -> None:
 107        """
 108        The set_config function takes a config object and assigns it as the configuration object for the
 109        class.
 110        
 111        :param config: The `config` parameter in the `set_config` function is a dictionary object that
 112        contains configuration settings for the class. When you call the `set_config` function with a
 113        dictionary object as the argument, it will set that dictionary as the configuration object for
 114        the class
 115        :type config: dict
 116        """
 117
 118        self.config = config
 119
 120    def set_param(self, param: dict) -> None:
 121        """
 122        This function sets a parameter object for the class based on the input dictionary.
 123        
 124        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
 125        as the `param` attribute of the class instance
 126        :type param: dict
 127        """
 128
 129        self.param = param
 130
 131    def init_variables(self) -> None:
 132        """
 133        This function initializes the variables that will be used in the rest of the class
 134        """
 135
 136        self.prefix = "howard"
 137        self.table_variants = "variants"
 138        self.dataframe = None
 139
 140        self.comparison_map = {
 141            "gt": ">",
 142            "gte": ">=",
 143            "lt": "<",
 144            "lte": "<=",
 145            "equals": "=",
 146            "contains": "SIMILAR TO",
 147        }
 148
 149        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
 150
 151        self.code_type_map_to_sql = {
 152            "Integer": "INTEGER",
 153            "String": "VARCHAR",
 154            "Float": "FLOAT",
 155            "Flag": "VARCHAR",
 156        }
 157
 158        self.index_additionnal_fields = []
 159
 160    def get_indexing(self) -> bool:
 161        """
 162        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
 163        returns False.
 164        :return: The value of the indexing parameter.
 165        """
 166
 167        return self.get_param().get("indexing", False)
 168
 169    def get_connexion_config(self) -> dict:
 170        """
 171        The function `get_connexion_config` returns a dictionary containing the configuration for a
 172        connection, including the number of threads and memory limit.
 173        :return: a dictionary containing the configuration for the Connexion library.
 174        """
 175
 176        # config
 177        config = self.get_config()
 178
 179        # Connexion config
 180        connexion_config = {}
 181        threads = self.get_threads()
 182
 183        # Threads
 184        if threads:
 185            connexion_config["threads"] = threads
 186
 187        # Memory
 188        # if config.get("memory", None):
 189        #     connexion_config["memory_limit"] = config.get("memory")
 190        if self.get_memory():
 191            connexion_config["memory_limit"] = self.get_memory()
 192
 193        # Temporary directory
 194        if config.get("tmp", None):
 195            connexion_config["temp_directory"] = config.get("tmp")
 196
 197        # Access
 198        if config.get("access", None):
 199            access = config.get("access")
 200            if access in ["RO"]:
 201                access = "READ_ONLY"
 202            elif access in ["RW"]:
 203                access = "READ_WRITE"
 204            connexion_db = self.get_connexion_db()
 205            if connexion_db in ":memory:":
 206                access = "READ_WRITE"
 207            connexion_config["access_mode"] = access
 208
 209        return connexion_config
 210
 211    def get_duckdb_settings(self) -> dict:
 212        """
 213        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
 214        string.
 215        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
 216        """
 217
 218        # config
 219        config = self.get_config()
 220
 221        # duckdb settings
 222        duckdb_settings_dict = {}
 223        if config.get("duckdb_settings", None):
 224            duckdb_settings = config.get("duckdb_settings")
 225            duckdb_settings = full_path(duckdb_settings)
 226            # duckdb setting is a file
 227            if os.path.exists(duckdb_settings):
 228                with open(duckdb_settings) as json_file:
 229                    duckdb_settings_dict = yaml.safe_load(json_file)
 230            # duckdb settings is a string
 231            else:
 232                duckdb_settings_dict = json.loads(duckdb_settings)
 233
 234        return duckdb_settings_dict
 235
 236    def set_connexion_db(self) -> str:
 237        """
 238        The function `set_connexion_db` returns the appropriate database connection string based on the
 239        input format and connection type.
 240        :return: the value of the variable `connexion_db`.
 241        """
 242
 243        # Default connexion db
 244        default_connexion_db = ":memory:"
 245
 246        # Find connexion db
 247        if self.get_input_format() in ["db", "duckdb"]:
 248            connexion_db = self.get_input()
 249        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
 250            connexion_db = default_connexion_db
 251        elif self.get_connexion_type() in ["tmpfile"]:
 252            tmp_name = tempfile.mkdtemp(
 253                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
 254            )
 255            connexion_db = f"{tmp_name}/tmp.db"
 256        elif self.get_connexion_type() != "":
 257            connexion_db = self.get_connexion_type()
 258        else:
 259            connexion_db = default_connexion_db
 260
 261        # Set connexion db
 262        self.connexion_db = connexion_db
 263
 264        return connexion_db
 265
 266    def set_connexion(self, conn) -> None:
 267        """
 268        The function `set_connexion` creates a connection to a database, with options for different
 269        database formats and settings.
 270        
 271        :param conn: The `conn` parameter in the `set_connexion` method is the connection to the
 272        database. If a connection is not provided, a new connection to an in-memory database is created.
 273        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
 274        sqlite
 275        """
 276
 277        # Connexion db
 278        connexion_db = self.set_connexion_db()
 279
 280        # Connexion config
 281        connexion_config = self.get_connexion_config()
 282
 283        # Connexion format
 284        connexion_format = self.get_config().get("connexion_format", "duckdb")
 285        # Set connexion format
 286        self.connexion_format = connexion_format
 287
 288        # Connexion
 289        if not conn:
 290            if connexion_format in ["duckdb"]:
 291                conn = duckdb.connect(connexion_db, config=connexion_config)
 292                # duckDB settings
 293                duckdb_settings = self.get_duckdb_settings()
 294                if duckdb_settings:
 295                    for setting in duckdb_settings:
 296                        setting_value = duckdb_settings.get(setting)
 297                        if isinstance(setting_value, str):
 298                            setting_value = f"'{setting_value}'"
 299                        conn.execute(f"PRAGMA {setting}={setting_value};")
 300            elif connexion_format in ["sqlite"]:
 301                conn = sqlite3.connect(connexion_db)
 302
 303        # Set connexion
 304        self.conn = conn
 305
 306        # Log
 307        log.debug(f"connexion_format: {connexion_format}")
 308        log.debug(f"connexion_db: {connexion_db}")
 309        log.debug(f"connexion config: {connexion_config}")
 310        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
 311
 312    def set_output(self, output: str = None) -> None:
 313        """
 314        The `set_output` function in Python sets the output file based on the input or a specified key
 315        in the config file, extracting the output name, extension, and format.
 316        
 317        :param output: The `output` parameter in the `set_output` method is used to specify the name of
 318        the output file. If the config file has an 'output' key, the method sets the output to the value
 319        of that key. If no output is provided, it sets the output to `None`
 320        :type output: str
 321        """
 322
 323        if output and not isinstance(output, str):
 324            self.output = output.name
 325        else:
 326            self.output = output
 327
 328        # Output format
 329        if self.output:
 330            output_name, output_extension = os.path.splitext(self.output)
 331            self.output_name = output_name
 332            self.output_extension = output_extension
 333            self.output_format = self.output_extension.replace(".", "")
 334        else:
 335            self.output_name = None
 336            self.output_extension = None
 337            self.output_format = None
 338
    def set_header(self) -> None:
        """
        Read (or reconstruct) the VCF header of the input file and store it
        both as a list of strings (`self.header_list`) and as a `vcf.Reader`
        object (`self.header_vcf`); both are None when there is no input.

        The header is searched, in order:
        1. the `header_file` given in the configuration;
        2. inside the input file itself (vcf/hdr formats, possibly
           bgzip-compressed);
        3. an external `<input>.hdr` sidecar file;
        4. reconstructed from the file's columns via `Database`;
        falling back to a minimal default VCF header.

        :raises ValueError: if the input file format is not supported
        """

        input_file = self.get_input()
        # Minimal valid VCF header used as a last-resort fallback
        default_header_list = [
            "##fileformat=VCFv4.2",
            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
        ]

        # Full path
        input_file = full_path(input_file)

        if input_file:

            input_format = self.get_input_format()
            input_compressed = self.get_input_compressed()
            config = self.get_config()
            header_list = default_header_list
            # Formats for which a header can be found or reconstructed
            if input_format in [
                "vcf",
                "hdr",
                "tsv",
                "csv",
                "psv",
                "parquet",
                "db",
                "duckdb",
            ]:
                # header provided in param
                if config.get("header_file", None):
                    with open(config.get("header_file"), "rt") as f:
                        header_list = self.read_vcf_header(f)
                # within a vcf file format (header within input file itsself)
                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                    # within a compressed vcf file format (.vcf.gz)
                    if input_compressed:
                        with bgzf.open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                    # within an uncompressed vcf file format (.vcf)
                    else:
                        with open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                # header provided in default external file .hdr
                elif os.path.exists((input_file + ".hdr")):
                    with open(input_file + ".hdr", "rt") as f:
                        header_list = self.read_vcf_header(f)
                else:
                    try:  # Try to get header info fields and file columns

                        with tempfile.TemporaryDirectory() as tmpdir:

                            # Create database
                            db_for_header = Database(database=input_file)

                            # Get header columns for infos fields
                            db_header_from_columns = (
                                db_for_header.get_header_from_columns()
                            )

                            # Get real columns in the file
                            db_header_columns = db_for_header.get_columns()

                            # Write header file
                            # NOTE(review): vcf.Writer serializes the header on
                            # construction; the writer object itself is discarded.
                            header_file_tmp = os.path.join(tmpdir, "header")
                            f = open(header_file_tmp, "w")
                            vcf.Writer(f, db_header_from_columns)
                            f.close()

                            # Replace #CHROM line with rel columns
                            header_list = db_for_header.read_header_file(
                                header_file=header_file_tmp
                            )
                            header_list[-1] = "\t".join(db_header_columns)

                    # NOTE(review): intentionally broad — any failure while
                    # reconstructing the header degrades to the default header.
                    except:

                        log.warning(
                            f"No header for file {input_file}. Set as default VCF header"
                        )
                        header_list = default_header_list

            else:  # try for unknown format ?

                log.error(f"Input file format '{input_format}' not available")
                raise ValueError(f"Input file format '{input_format}' not available")

            # Guard against an empty header from any branch above
            if not header_list:
                header_list = default_header_list

            # header as list
            self.header_list = header_list

            # header as VCF object
            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

        else:

            # No input file: no header available
            self.header_list = None
            self.header_vcf = None
 440
 441    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
 442        """
 443        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
 444        DataFrame based on the connection format.
 445
 446        :param query: The `query` parameter in the `get_query_to_df` function is a string that
 447        represents the SQL query you want to execute. This query will be used to fetch data from a
 448        database and convert it into a pandas DataFrame
 449        :type query: str
 450        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
 451        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
 452        function will only fetch up to that number of rows from the database query result. If no limit
 453        is specified,
 454        :type limit: int
 455        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
 456        """
 457
 458        # Connexion format
 459        connexion_format = self.get_connexion_format()
 460
 461        # Limit in query
 462        if limit:
 463            pd.set_option("display.max_rows", limit)
 464            if connexion_format in ["duckdb"]:
 465                df = (
 466                    self.conn.execute(query)
 467                    .fetch_record_batch(limit)
 468                    .read_next_batch()
 469                    .to_pandas()
 470                )
 471            elif connexion_format in ["sqlite"]:
 472                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
 473
 474        # Full query
 475        else:
 476            if connexion_format in ["duckdb"]:
 477                df = self.conn.execute(query).df()
 478            elif connexion_format in ["sqlite"]:
 479                df = pd.read_sql_query(query, self.conn)
 480
 481        return df
 482
 483    def get_overview(self) -> None:
 484        """
 485        The function prints the input, output, config, and dataframe of the current object
 486        """
 487        table_variants_from = self.get_table_variants(clause="from")
 488        sql_columns = self.get_header_columns_as_sql()
 489        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
 490        df = self.get_query_to_df(sql_query_export)
 491        log.info(
 492            "Input:  "
 493            + str(self.get_input())
 494            + " ["
 495            + str(str(self.get_input_format()))
 496            + "]"
 497        )
 498        log.info(
 499            "Output: "
 500            + str(self.get_output())
 501            + " ["
 502            + str(str(self.get_output_format()))
 503            + "]"
 504        )
 505        log.info("Config: ")
 506        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
 507            "\n"
 508        ):
 509            log.info("\t" + str(d))
 510        log.info("Param: ")
 511        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
 512            "\n"
 513        ):
 514            log.info("\t" + str(d))
 515        log.info("Sample list: " + str(self.get_header_sample_list()))
 516        log.info("Dataframe: ")
 517        for d in str(df).split("\n"):
 518            log.info("\t" + str(d))
 519
 520        # garbage collector
 521        del df
 522        gc.collect()
 523
 524        return None
 525
    def get_stats(self) -> dict:
        """
        Compute statistics of the current object: input file, number of
        variants (total and per chromosome), samples and their genotype
        counts, INFO/FORMAT header fields, quality metrics, and SNV/MNV/InDel
        counts with SNV substitution breakdown.

        NOTE(review): the SQL uses functions such as REGEXP_EXTRACT, len()
        and median() — presumably DuckDB dialect; likely not portable to a
        sqlite connexion. Confirm before calling with connexion_format
        "sqlite".

        :return: a dictionary with sections "Infos", "Variants", "Samples"
            (only when genotypes are present), "Header", and "Quality" (only
            when a QUAL column exists)
        """

        # Log
        log.info(f"Stats Calculation...")

        # Variants table name
        table_variants_from = self.get_table_variants()

        # stats dict
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header: INFO and FORMAT field definitions from the VCF header
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Variants by chr
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Calculate percentage (fraction of total, per chromosome)
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

        stats["Infos"]["Number of variants"] = int(nb_of_variants)

        ### Samples

        # Init
        samples = {}
        nb_of_samples = 0

        # Check Samples: only meaningful when genotypes (GT) are present
        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
            log.debug(f"Check samples...")
            for sample in self.get_header_sample_list():
                # Count genotypes per sample; the WHERE clause keeps only rows
                # with a well-formed genotype whose field count matches FORMAT
                sql_query_samples = f"""
                    SELECT  '{sample}' as sample,
                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                      )
                    GROUP BY genotype
                    """
                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
                sample_genotype_count = sql_query_genotype_df["count"].sum()
                # A sample counts only when it has at least one genotype row
                if len(sql_query_genotype_df):
                    nb_of_samples += 1
                    samples[f"{sample} - {sample_genotype_count} variants"] = (
                        sql_query_genotype_df.to_dict(orient="index")
                    )

            stats["Samples"] = samples
            stats["Infos"]["Number of samples"] = nb_of_samples

        # #
        # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list:
        #     stats["Infos"]["Number of samples"] = nb_of_samples
        # elif nb_of_samples:
        #     stats["Infos"]["Number of samples"] = "not a VCF format"

        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        # `i` is a running row index shared across both field tables
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # ID
                header_infos_dict[i]["id"] = info

                # num: map PyVCF special codes to VCF Number letters
                # (None→".", -1→"A", -2→"G", -3→"R")
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # type (defaults to "." when absent)
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # desc (defaults to empty string when absent)
                if header_type_infos[info].desc != None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL: descriptive statistics, skipping missing ('.') values
        if "QUAL" in self.get_header_columns():
            sql_query_qual = f"""
                    SELECT
                        avg(CAST(QUAL AS INTEGER)) AS Average,
                        min(CAST(QUAL AS INTEGER)) AS Minimum,
                        max(CAST(QUAL AS INTEGER)) AS Maximum,
                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                        median(CAST(QUAL AS INTEGER)) AS Median,
                        variance(CAST(QUAL AS INTEGER)) AS Variance
                    FROM {table_variants_from}
                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
                    """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV and InDel: classify variants by REF/ALT lengths

        sql_query_snv = f"""
            
            SELECT Type, count FROM (

                    SELECT
                        'Total' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}

                    UNION

                    SELECT
                        'MNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 AND len(ALT) > 1
                    AND len(REF) = len(ALT)

                    UNION

                    SELECT
                        'InDel' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 OR len(ALT) > 1
                    AND len(REF) != len(ALT)
                    
                    UNION

                    SELECT
                        'SNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) = 1 AND len(ALT) = 1

                )

            ORDER BY count DESC

                """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        # SNV substitutions (REF>ALT pairs), most frequent first
        sql_query_snv_substitution = f"""
                SELECT
                    concat(REF, '>', ALT) AS 'Substitution',
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1
                GROUP BY REF, ALT
                ORDER BY count(*) DESC
                """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats
 747
 748    def stats_to_file(self, file: str = None) -> str:
 749        """
 750        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
 751        into a JSON object, and writes the JSON object to the specified file.
 752
 753        :param file: The `file` parameter is a string that represents the file path where the JSON data
 754        will be written
 755        :type file: str
 756        :return: the name of the file that was written to.
 757        """
 758
 759        # Get stats
 760        stats = self.get_stats()
 761
 762        # Serializing json
 763        json_object = json.dumps(stats, indent=4)
 764
 765        # Writing to sample.json
 766        with open(file, "w") as outfile:
 767            outfile.write(json_object)
 768
 769        return file
 770
 771    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
 772        """
 773        The `print_stats` function generates a markdown file and prints the statistics contained in a
 774        JSON file in a formatted manner.
 775
 776        :param output_file: The `output_file` parameter is a string that specifies the path and filename
 777        of the output file where the stats will be printed in Markdown format. If no `output_file` is
 778        provided, a temporary directory will be created and the stats will be saved in a file named
 779        "stats.md" within that
 780        :type output_file: str
 781        :param json_file: The `json_file` parameter is a string that represents the path to the JSON
 782        file where the statistics will be saved. If no value is provided, a temporary directory will be
 783        created and a default file name "stats.json" will be used
 784        :type json_file: str
 785        :return: The function `print_stats` does not return any value. It has a return type annotation
 786        of `None`.
 787        """
 788
 789        # Full path
 790        output_file = full_path(output_file)
 791        json_file = full_path(json_file)
 792
 793        with tempfile.TemporaryDirectory() as tmpdir:
 794
 795            # Files
 796            if not output_file:
 797                output_file = os.path.join(tmpdir, "stats.md")
 798            if not json_file:
 799                json_file = os.path.join(tmpdir, "stats.json")
 800
 801            # Create folders
 802            if not os.path.exists(os.path.dirname(output_file)):
 803                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
 804            if not os.path.exists(os.path.dirname(json_file)):
 805                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)
 806
 807            # Create stats JSON file
 808            stats_file = self.stats_to_file(file=json_file)
 809
 810            # Print stats file
 811            with open(stats_file) as f:
 812                stats = yaml.safe_load(f)
 813
 814            # Output
 815            output_title = []
 816            output_index = []
 817            output = []
 818
 819            # Title
 820            output_title.append("# HOWARD Stats")
 821
 822            # Index
 823            output_index.append("## Index")
 824
 825            # Process sections
 826            for section in stats:
 827                infos = stats.get(section)
 828                section_link = "#" + section.lower().replace(" ", "-")
 829                output.append(f"## {section}")
 830                output_index.append(f"- [{section}]({section_link})")
 831
 832                if len(infos):
 833                    for info in infos:
 834                        try:
 835                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
 836                            is_df = True
 837                        except:
 838                            try:
 839                                df = pd.DataFrame.from_dict(
 840                                    json.loads((infos.get(info))), orient="index"
 841                                )
 842                                is_df = True
 843                            except:
 844                                is_df = False
 845                        if is_df:
 846                            output.append(f"### {info}")
 847                            info_link = "#" + info.lower().replace(" ", "-")
 848                            output_index.append(f"   - [{info}]({info_link})")
 849                            output.append(f"{df.to_markdown(index=False)}")
 850                        else:
 851                            output.append(f"- {info}: {infos.get(info)}")
 852                else:
 853                    output.append(f"NA")
 854
 855            # Write stats in markdown file
 856            with open(output_file, "w") as fp:
 857                for item in output_title:
 858                    fp.write("%s\n" % item)
 859                for item in output_index:
 860                    fp.write("%s\n" % item)
 861                for item in output:
 862                    fp.write("%s\n" % item)
 863
 864            # Output stats in markdown
 865            print("")
 866            print("\n\n".join(output_title))
 867            print("")
 868            print("\n\n".join(output))
 869            print("")
 870
 871        return None
 872
 873    def get_input(self) -> str:
 874        """
 875        It returns the value of the input variable.
 876        :return: The input is being returned.
 877        """
 878        return self.input
 879
 880    def get_input_format(self, input_file: str = None) -> str:
 881        """
 882        This function returns the format of the input variable, either from the provided input file or
 883        by prompting for input.
 884
 885        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
 886        represents the file path of the input file. If no `input_file` is provided when calling the
 887        method, it will default to `None`
 888        :type input_file: str
 889        :return: The format of the input variable is being returned.
 890        """
 891
 892        if not input_file:
 893            input_file = self.get_input()
 894        input_format = get_file_format(input_file)
 895        return input_format
 896
 897    def get_input_compressed(self, input_file: str = None) -> str:
 898        """
 899        The function `get_input_compressed` returns the format of the input variable after compressing
 900        it.
 901
 902        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
 903        that represents the file path of the input file. If no `input_file` is provided when calling the
 904        method, it will default to `None` and the method will then call `self.get_input()` to
 905        :type input_file: str
 906        :return: The function `get_input_compressed` returns the compressed format of the input
 907        variable.
 908        """
 909
 910        if not input_file:
 911            input_file = self.get_input()
 912        input_compressed = get_file_compressed(input_file)
 913        return input_compressed
 914
 915    def get_output(self) -> str:
 916        """
 917        It returns the output of the neuron.
 918        :return: The output of the neural network.
 919        """
 920
 921        return self.output
 922
 923    def get_output_format(self, output_file: str = None) -> str:
 924        """
 925        The function `get_output_format` returns the format of the input variable or the output file if
 926        provided.
 927
 928        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
 929        that represents the file path of the output file. If no `output_file` is provided when calling
 930        the method, it will default to the output obtained from the `get_output` method of the class
 931        instance. The
 932        :type output_file: str
 933        :return: The format of the input variable is being returned.
 934        """
 935
 936        if not output_file:
 937            output_file = self.get_output()
 938        output_format = get_file_format(output_file)
 939
 940        return output_format
 941
 942    def get_config(self) -> dict:
 943        """
 944        It returns the config
 945        :return: The config variable is being returned.
 946        """
 947        return self.config
 948
 949    def get_param(self) -> dict:
 950        """
 951        It returns the param
 952        :return: The param variable is being returned.
 953        """
 954        return self.param
 955
 956    def get_connexion_db(self) -> str:
 957        """
 958        It returns the connexion_db attribute of the object
 959        :return: The connexion_db is being returned.
 960        """
 961        return self.connexion_db
 962
 963    def get_prefix(self) -> str:
 964        """
 965        It returns the prefix of the object.
 966        :return: The prefix is being returned.
 967        """
 968        return self.prefix
 969
 970    def get_table_variants(self, clause: str = "select") -> str:
 971        """
 972        This function returns the table_variants attribute of the object
 973
 974        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
 975        defaults to select (optional)
 976        :return: The table_variants attribute of the object.
 977        """
 978
 979        # Access
 980        access = self.get_config().get("access", None)
 981
 982        # Clauses "select", "where", "update"
 983        if clause in ["select", "where", "update"]:
 984            table_variants = self.table_variants
 985        # Clause "from"
 986        elif clause in ["from"]:
 987            # For Read Only
 988            if self.get_input_format() in ["parquet"] and access in ["RO"]:
 989                input_file = self.get_input()
 990                table_variants = f"'{input_file}' as variants"
 991            # For Read Write
 992            else:
 993                table_variants = f"{self.table_variants} as variants"
 994        else:
 995            table_variants = self.table_variants
 996        return table_variants
 997
 998    def get_tmp_dir(self) -> str:
 999        """
1000        The function `get_tmp_dir` returns the temporary directory path based on configuration
1001        parameters or a default path.
1002        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
1003        configuration, parameters, and a default value of "/tmp".
1004        """
1005
1006        return get_tmp(
1007            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
1008        )
1009
1010    def get_connexion_type(self) -> str:
1011        """
1012        If the connexion type is not in the list of allowed connexion types, raise a ValueError
1013
1014        :return: The connexion type is being returned.
1015        """
1016        return self.get_config().get("connexion_type", "memory")
1017
1018    def get_connexion(self):
1019        """
1020        It returns the connection object
1021
1022        :return: The connection object.
1023        """
1024        return self.conn
1025
1026    def close_connexion(self) -> None:
1027        """
1028        This function closes the connection to the database.
1029        :return: The connection is being closed.
1030        """
1031        return self.conn.close()
1032
1033    def get_header(self, type: str = "vcf"):
1034        """
1035        This function returns the header of the VCF file as a list of strings
1036
1037        :param type: the type of header you want to get, defaults to vcf (optional)
1038        :return: The header of the vcf file.
1039        """
1040
1041        if self.header_vcf:
1042            if type == "vcf":
1043                return self.header_vcf
1044            elif type == "list":
1045                return self.header_list
1046        else:
1047            if type == "vcf":
1048                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
1049                return header
1050            elif type == "list":
1051                return vcf_required
1052
1053    def get_header_length(self, file: str = None) -> int:
1054        """
1055        The function `get_header_length` returns the length of the header list, excluding the #CHROM
1056        line.
1057
1058        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
1059        header file. If this argument is provided, the function will read the header from the specified
1060        file and return the length of the header list minus 1 (to exclude the #CHROM line)
1061        :type file: str
1062        :return: the length of the header list, excluding the #CHROM line.
1063        """
1064
1065        if file:
1066            return len(self.read_vcf_header_file(file=file)) - 1
1067        elif self.get_header(type="list"):
1068            return len(self.get_header(type="list")) - 1
1069        else:
1070            return 0
1071
1072    def get_header_columns(self) -> str:
1073        """
1074        This function returns the header list of a VCF
1075
1076        :return: The length of the header list.
1077        """
1078        if self.get_header():
1079            return self.get_header(type="list")[-1]
1080        else:
1081            return ""
1082
1083    def get_header_columns_as_list(self) -> list:
1084        """
1085        This function returns the header list of a VCF
1086
1087        :return: The length of the header list.
1088        """
1089        if self.get_header():
1090            return self.get_header_columns().strip().split("\t")
1091        else:
1092            return []
1093
1094    def get_header_columns_as_sql(self) -> str:
1095        """
1096        This function retruns header length (without #CHROM line)
1097
1098        :return: The length of the header list.
1099        """
1100        sql_column_list = []
1101        for col in self.get_header_columns_as_list():
1102            sql_column_list.append(f'"{col}"')
1103        return ",".join(sql_column_list)
1104
1105    def get_header_sample_list(self) -> list:
1106        """
1107        This function retruns header length (without #CHROM line)
1108
1109        :return: The length of the header list.
1110        """
1111        return self.header_vcf.samples
1112
1113    def get_verbose(self) -> bool:
1114        """
1115        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
1116        exist
1117
1118        :return: The value of the key "verbose" in the config dictionary.
1119        """
1120        return self.get_config().get("verbose", False)
1121
1122    def get_connexion_format(self) -> str:
1123        """
1124        It returns the connexion format of the object.
1125        :return: The connexion_format is being returned.
1126        """
1127        connexion_format = self.connexion_format
1128        if connexion_format not in ["duckdb", "sqlite"]:
1129            log.error(f"Unknown connexion format {connexion_format}")
1130            raise ValueError(f"Unknown connexion format {connexion_format}")
1131        else:
1132            return connexion_format
1133
1134    def insert_file_to_table(
1135        self,
1136        file,
1137        columns: str,
1138        header_len: int = 0,
1139        sep: str = "\t",
1140        chunksize: int = 1000000,
1141    ) -> None:
1142        """
1143        The function reads a file in chunks and inserts each chunk into a table based on the specified
1144        database format.
1145
1146        :param file: The `file` parameter is the file that you want to load into a table. It should be
1147        the path to the file on your system
1148        :param columns: The `columns` parameter in the `insert_file_to_table` function is a string that
1149        should contain the names of the columns in the table where the data will be inserted. The column
1150        names should be separated by commas within the string. For example, if you have columns named
1151        "id", "name
1152        :type columns: str
1153        :param header_len: The `header_len` parameter in the `insert_file_to_table` function specifies
1154        the number of lines to skip at the beginning of the file before reading the actual data. This
1155        parameter allows you to skip any header information present in the file before processing the
1156        data, defaults to 0
1157        :type header_len: int (optional)
1158        :param sep: The `sep` parameter in the `insert_file_to_table` function is used to specify the
1159        separator character that is used in the file being read. In this case, the default separator is
1160        set to `\t`, which represents a tab character. You can change this parameter to a different
1161        separator character if, defaults to \t
1162        :type sep: str (optional)
1163        :param chunksize: The `chunksize` parameter specifies the number of rows to read in at a time
1164        when processing the file in chunks. In the provided code snippet, the default value for
1165        `chunksize` is set to 1000000. This means that the file will be read in chunks of 1,, defaults
1166        to 1000000
1167        :type chunksize: int (optional)
1168        """
1169
1170        # Config
1171        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
1172        connexion_format = self.get_connexion_format()
1173
1174        log.debug("chunksize: " + str(chunksize))
1175
1176        if chunksize:
1177            for chunk in pd.read_csv(
1178                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
1179            ):
1180                if connexion_format in ["duckdb"]:
1181                    sql_insert_into = (
1182                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
1183                    )
1184                    self.conn.execute(sql_insert_into)
1185                elif connexion_format in ["sqlite"]:
1186                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
1187
    def load_data(
        self,
        input_file: str = None,
        drop_variants_table: bool = False,
        sample_size: int = 20480,
    ) -> None:
        """
        Load the input file into the "variants" table of the connected database.

        For a duckdb connexion, the input is either an already-attached duckdb
        database, or any format the Database helper can expose through a SQL
        FROM clause (loaded via CREATE TABLE/VIEW ... AS SELECT). For a sqlite
        connexion, only delimited text formats (vcf, tsv, csv, psv) are
        supported; the table is created explicitly and rows are inserted
        chunk by chunk. After loading, INFO fields may be exploded into
        columns and indexes are created.

        :param input_file: path of the file to load into the table; when given,
        it replaces the current input and the header is reloaded
        :type input_file: str
        :param drop_variants_table: drop the variants table before loading,
        defaults to False
        :type drop_variants_table: bool (optional)
        :param sample_size: number of rows sampled (by the Database helper) to
        infer column types; a falsy value becomes -1 (presumably "all rows" —
        confirm against Database.get_sql_from), defaults to 20480
        :type sample_size: int (optional)
        """

        log.info("Loading...")

        # Change input file and refresh the header accordingly
        if input_file:
            self.set_input(input_file)
            self.set_header()

        # Drop variants table if a clean reload is requested
        if drop_variants_table:
            self.drop_variants_table()

        # Target table name
        table_variants = self.get_table_variants()

        # Access mode ("RO" selects a VIEW instead of a TABLE below)
        access = self.get_config().get("access", None)
        log.debug(f"access: {access}")

        # Input format and compression
        input_format = self.get_input_format()
        input_compressed = self.get_input_compressed()
        log.debug(f"input_format: {input_format}")
        log.debug(f"input_compressed: {input_compressed}")

        # Compression label (only used for logging here)
        if input_compressed:
            input_compressed_format = "gzip"
        else:
            input_compressed_format = "none"
        log.debug(f"input_compressed_format: {input_compressed_format}")

        # Connexion format ("duckdb" or "sqlite", validated)
        connexion_format = self.get_connexion_format()

        # Sample size: falsy means -1
        if not sample_size:
            sample_size = -1
        log.debug(f"sample_size: {sample_size}")

        # Load data
        log.debug(f"Load Data from {input_format}")

        # DuckDB connexion
        if connexion_format in ["duckdb"]:

            # Input is already a duckdb database (nothing to copy);
            # NOTE(review): self.input_format is presumably set by set_input —
            # confirm it mirrors get_input_format()
            if self.input_format in ["db", "duckdb"]:

                if connexion_format in ["duckdb"]:
                    log.debug(f"Input file format '{self.input_format}' duckDB")
                else:
                    # Unreachable under the outer "duckdb" branch, kept as a guard
                    log.error(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )
                    raise ValueError(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )

            # Load from any format the Database helper can read
            else:

                try:
                    # Build a SQL FROM clause for the input file
                    database = Database(database=self.input)
                    sql_from = database.get_sql_from(sample_size=sample_size)

                    # Read-only: expose a VIEW; otherwise materialize a TABLE
                    if access in ["RO"]:
                        sql_load = (
                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    else:
                        sql_load = (
                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    self.conn.execute(sql_load)

                # NOTE(review): bare except — any failure (not only unsupported
                # formats) is reported as "format not available"
                except:
                    # Format not available
                    log.error(f"Input file format '{self.input_format}' not available")
                    raise ValueError(
                        f"Input file format '{self.input_format}' not available"
                    )

        # SQLite connexion: only delimited text formats are supported
        elif connexion_format in ["sqlite"] and input_format in [
            "vcf",
            "tsv",
            "csv",
            "psv",
        ]:

            # Fixed VCF column structure
            structure = {
                "#CHROM": "VARCHAR",
                "POS": "INTEGER",
                "ID": "VARCHAR",
                "REF": "VARCHAR",
                "ALT": "VARCHAR",
                "QUAL": "VARCHAR",
                "FILTER": "VARCHAR",
                "INFO": "VARCHAR",
            }

            # Structure extended with FORMAT and one column per sample
            # NOTE(review): this is an alias, not a copy — "structure" is
            # mutated too (harmless here, since only structure_complete is used)
            structure_complete = structure
            if self.get_header_sample_list():
                structure["FORMAT"] = "VARCHAR"
                for sample in self.get_header_sample_list():
                    structure_complete[sample] = "VARCHAR"

            # Column fragments for CREATE TABLE and INSERT
            sql_create_table_columns = []
            sql_create_table_columns_list = []
            for column in structure_complete:
                column_type = structure_complete[column]
                sql_create_table_columns.append(
                    f'"{column}" {column_type} default NULL'
                )
                sql_create_table_columns_list.append(f'"{column}"')

            # Create the variants table
            log.debug(f"Create Table {table_variants}")
            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
            self.conn.execute(sql_create_table)

            # Number of rows loaded per chunk
            chunksize = 100000

            # Field delimiter for the input format (tab by default)
            delimiter = file_format_delimiters.get(input_format, "\t")

            # Load the input file
            with open(self.input, "rt") as input_file:

                # For compressed input, re-open through bgzf instead
                # NOTE(review): the original plain handle stays open until the
                # "with" block exits; the bgzf handle is never explicitly closed
                if input_compressed:
                    input_file = bgzf.open(self.input, "rt")
                # Only VCF carries a meta header to skip
                if input_format in ["vcf"]:
                    header_len = self.get_header_length()
                else:
                    header_len = 0

                # Insert the file contents into the table, chunk by chunk
                self.insert_file_to_table(
                    input_file,
                    columns=sql_create_table_columns_list_sql,
                    header_len=header_len,
                    sep=delimiter,
                    chunksize=chunksize,
                )

        # Unsupported connexion/format combination
        else:
            log.error(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )
            raise ValueError(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )

        # Explode INFO fields into table columns, when requested
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        # Create indexes after insertion
        self.create_indexes()
1383
1384    def get_explode_infos(self) -> bool:
1385        """
1386        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
1387        to False if it is not set.
1388        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
1389        value. If the parameter is not present, it will return False.
1390        """
1391
1392        return self.get_param().get("explode", {}).get("explode_infos", False)
1393
1394    def get_explode_infos_fields(
1395        self,
1396        explode_infos_fields: str = None,
1397        remove_fields_not_in_header: bool = False,
1398    ) -> list:
1399        """
1400        The `get_explode_infos_fields` function returns a list of exploded information fields based on
1401        the input parameter `explode_infos_fields`.
1402
1403        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
1404        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
1405        comma-separated list of field names to explode
1406        :type explode_infos_fields: str
1407        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
1408        flag that determines whether to remove fields that are not present in the header. If it is set
1409        to `True`, any field that is not in the header will be excluded from the list of exploded
1410        information fields. If it is set to `, defaults to False
1411        :type remove_fields_not_in_header: bool (optional)
1412        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
1413        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
1414        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
1415        Otherwise, it returns a list of exploded information fields after removing any spaces and
1416        splitting the string by commas.
1417        """
1418
1419        # If no fields, get it in param
1420        if not explode_infos_fields:
1421            explode_infos_fields = (
1422                self.get_param().get("explode", {}).get("explode_infos_fields", None)
1423            )
1424
1425        # If no fields, defined as all fields in header using keyword
1426        if not explode_infos_fields:
1427            explode_infos_fields = "*"
1428
1429        # If fields list not empty
1430        if explode_infos_fields:
1431
1432            # Input fields list
1433            if isinstance(explode_infos_fields, str):
1434                fields_input = explode_infos_fields.split(",")
1435            elif isinstance(explode_infos_fields, list):
1436                fields_input = explode_infos_fields
1437            else:
1438                fields_input = []
1439
1440            # Fields list without * keyword
1441            fields_without_all = fields_input.copy()
1442            if "*".casefold() in (item.casefold() for item in fields_without_all):
1443                fields_without_all.remove("*")
1444
1445            # Fields in header
1446            fields_in_header = sorted(list(set(self.get_header().infos)))
1447
1448            # Construct list of fields
1449            fields_output = []
1450            for field in fields_input:
1451
1452                # Strip field
1453                field = field.strip()
1454
1455                # format keyword * in regex
1456                if field.upper() in ["*"]:
1457                    field = ".*"
1458
1459                # Find all fields with pattern
1460                r = re.compile(field)
1461                fields_search = sorted(list(filter(r.match, fields_in_header)))
1462
1463                # Remove fields input from search
1464                if fields_search != [field]:
1465                    fields_search = sorted(
1466                        list(set(fields_search).difference(fields_input))
1467                    )
1468
1469                # If field is not in header (avoid not well formatted header)
1470                if not fields_search and not remove_fields_not_in_header:
1471                    fields_search = [field]
1472
1473                # Add found fields
1474                for new_field in fields_search:
1475                    # Add field, if not already exists, and if it is in header (if asked)
1476                    if (
1477                        new_field not in fields_output
1478                        and (
1479                            not remove_fields_not_in_header
1480                            or new_field in fields_in_header
1481                        )
1482                        and new_field not in [".*"]
1483                    ):
1484                        fields_output.append(new_field)
1485
1486            return fields_output
1487
1488        else:
1489
1490            return []
1491
1492    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
1493        """
1494        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
1495        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
1496        not provided.
1497
1498        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
1499        prefix to be used for exploding or expanding information
1500        :type explode_infos_prefix: str
1501        :return: the value of the variable `explode_infos_prefix`.
1502        """
1503
1504        if not explode_infos_prefix:
1505            explode_infos_prefix = (
1506                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
1507            )
1508
1509        return explode_infos_prefix
1510
    def add_column(
        self,
        table_name,
        column_name,
        column_type,
        default_value=None,
        drop: bool = False,
    ) -> dict:
        """
        Add a column to a table, optionally dropping a pre-existing one first.

        :param table_name: name of the table to alter
        :param column_name: name of the column to add
        :param column_type: SQL type of the new column (e.g. "INTEGER",
        "VARCHAR")
        :param default_value: optional DEFAULT value for the new column;
        NOTE(review): interpolated verbatim into the SQL, so string defaults
        must already be quoted by the caller — confirm call sites
        :param drop: when True and the column already exists, drop it and
        re-create it; when False (default), an existing column is left
        untouched and None is returned
        :type drop: bool (optional)
        :return: a dict describing the added column (table_name, column_name,
        column_type, default_value) when a brand-new column was added, or None
        when the column already existed (including the drop-and-recreate case).
        """

        # Track whether the column was newly added / previously dropped
        # added
        added = False
        dropped = False

        # Check if the column already exists in the table
        # (zero-row SELECT just to obtain the column names)
        query = f""" SELECT * FROM {table_name} LIMIT 0 """
        columns = self.get_query_to_df(query).columns.tolist()
        if column_name in columns:
            log.debug(
                f"The {column_name} column already exists in the {table_name} table"
            )
            if drop:
                self.drop_column(table_name=table_name, column_name=column_name)
                dropped = True
            else:
                # Column kept as-is: nothing added
                return None
        else:
            log.debug(f"The {column_name} column NOT exists in the {table_name} table")

        # Add column in table
        add_column_query = (
            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
        )
        if default_value is not None:
            add_column_query += f" DEFAULT {default_value}"
        self.execute_query(add_column_query)
        # A re-created column (after drop) is deliberately not reported as added
        added = not dropped
        log.debug(
            f"The {column_name} column was successfully added to the {table_name} table"
        )

        if added:
            added_column = {
                "table_name": table_name,
                "column_name": column_name,
                "column_type": column_type,
                "default_value": default_value,
            }
        else:
            added_column = None

        return added_column
1582
1583    def drop_column(
1584        self, column: dict = None, table_name: str = None, column_name: str = None
1585    ) -> bool:
1586        """
1587        The `drop_column` function drops a specified column from a given table in a database and returns
1588        True if the column was successfully dropped, and False if the column does not exist in the
1589        table.
1590
1591        :param column: The `column` parameter is a dictionary that contains information about the column
1592        you want to drop. It has two keys:
1593        :type column: dict
1594        :param table_name: The `table_name` parameter is the name of the table from which you want to
1595        drop a column
1596        :type table_name: str
1597        :param column_name: The `column_name` parameter is the name of the column that you want to drop
1598        from the table
1599        :type column_name: str
1600        :return: a boolean value. It returns True if the column was successfully dropped from the table,
1601        and False if the column does not exist in the table.
1602        """
1603
1604        # Find column infos
1605        if column:
1606            if isinstance(column, dict):
1607                table_name = column.get("table_name", None)
1608                column_name = column.get("column_name", None)
1609            elif isinstance(column, str):
1610                table_name = self.get_table_variants()
1611                column_name = column
1612            else:
1613                table_name = None
1614                column_name = None
1615
1616        if not table_name and not column_name:
1617            return False
1618
1619        # Removed
1620        removed = False
1621
1622        # Check if the column already exists in the table
1623        query = f""" SELECT * FROM {table_name} LIMIT 0 """
1624        columns = self.get_query_to_df(query).columns.tolist()
1625        if column_name in columns:
1626            log.debug(f"The {column_name} column exists in the {table_name} table")
1627        else:
1628            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
1629            return False
1630
1631        # Add column in table # ALTER TABLE integers DROP k
1632        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
1633        self.execute_query(add_column_query)
1634        removed = True
1635        log.debug(
1636            f"The {column_name} column was successfully dropped to the {table_name} table"
1637        )
1638
1639        return removed
1640
1641    def explode_infos(
1642        self,
1643        prefix: str = None,
1644        create_index: bool = False,
1645        fields: list = None,
1646        force: bool = False,
1647        proccess_all_fields_together: bool = False,
1648    ) -> list:
1649        """
1650        The `explode_infos` function takes a VCF file and explodes the INFO fields into individual
1651        columns, returning a list of added columns.
1652
1653        :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO
1654        fields. If the `prefix` is not provided or is set to `None`, the function will use the value of
1655        `self.get_explode_infos_prefix()` as the prefix
1656        :type prefix: str
1657        :param create_index: The `create_index` parameter is a boolean flag that specifies whether to
1658        create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to
1659        `False`, indexes will not be created. The default value is `False`, defaults to False
1660        :type create_index: bool (optional)
1661        :param fields: The `fields` parameter is a list of INFO fields that you want to explode into
1662        individual columns. If this parameter is not provided, all INFO fields will be exploded
1663        :type fields: list
1664        :param force: The `force` parameter is a boolean flag that determines whether to drop and
1665        recreate the column if it already exists in the table. If `force` is set to `True`, the column
1666        will be dropped and recreated. If `force` is set to `False`, the column will not be dropped,
1667        defaults to False
1668        :type force: bool (optional)
1669        :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean
1670        flag that determines whether to process all the INFO fields together or individually. If set to
1671        `True`, all the INFO fields will be processed together. If set to `False`, each INFO field will
1672        be processed individually, defaults to False
1673        :type proccess_all_fields_together: bool (optional)
1674        :return: The function `explode_infos` returns a list of added columns.
1675        """
1676
1677        # drop indexes
1678        self.drop_indexes()
1679
1680        # connexion format
1681        connexion_format = self.get_connexion_format()
1682
1683        # Access
1684        access = self.get_config().get("access", None)
1685
1686        # Added columns
1687        added_columns = []
1688
1689        if access not in ["RO"]:
1690
1691            # prefix
1692            if prefix in [None, True] or not isinstance(prefix, str):
1693                if self.get_explode_infos_prefix() not in [None, True]:
1694                    prefix = self.get_explode_infos_prefix()
1695                else:
1696                    prefix = "INFO/"
1697
1698            # table variants
1699            table_variants = self.get_table_variants(clause="select")
1700
1701            # extra infos
1702            try:
1703                extra_infos = self.get_extra_infos()
1704            except:
1705                extra_infos = []
1706
1707            # Header infos
1708            header_infos = self.get_header().infos
1709
1710            log.debug(
1711                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
1712            )
1713
1714            sql_info_alter_table_array = []
1715
1716            # Info fields to check
1717            fields_list = list(header_infos)
1718            if fields:
1719                fields_list += fields
1720            fields_list = set(fields_list)
1721
1722            # If no fields
1723            if not fields:
1724                fields = []
1725
1726            # Translate fields if patterns
1727            fields = self.get_explode_infos_fields(explode_infos_fields=fields)
1728
1729            for info in fields:
1730
1731                info_id_sql = prefix + info
1732
1733                if (
1734                    info in fields_list
1735                    or prefix + info in fields_list
1736                    or info in extra_infos
1737                ):
1738
1739                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")
1740
1741                    if info in header_infos:
1742                        info_type = header_infos[info].type
1743                        info_num = header_infos[info].num
1744                    else:
1745                        info_type = "String"
1746                        info_num = 0
1747
1748                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
1749                    if info_num != 1:
1750                        type_sql = "VARCHAR"
1751
1752                    # Add field
1753                    added_column = self.add_column(
1754                        table_name=table_variants,
1755                        column_name=info_id_sql,
1756                        column_type=type_sql,
1757                        default_value="null",
1758                        drop=force,
1759                    )
1760
1761                    if added_column:
1762                        added_columns.append(added_column)
1763
1764                    if added_column or force:
1765
1766                        # add field to index
1767                        self.index_additionnal_fields.append(info_id_sql)
1768
1769                        # Update field array
1770                        if connexion_format in ["duckdb"]:
1771                            update_info_field = f"""
1772                            "{info_id_sql}" =
1773                                CASE
1774                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
1775                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
1776                                END
1777                            """
1778                        elif connexion_format in ["sqlite"]:
1779                            update_info_field = f"""
1780                                "{info_id_sql}" =
1781                                    CASE
1782                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
1783                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
1784                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
1785                                    END
1786                            """
1787
1788                        sql_info_alter_table_array.append(update_info_field)
1789
1790            if sql_info_alter_table_array:
1791
1792                # By chromosomes
1793                try:
1794                    chromosomes_list = list(
1795                        self.get_query_to_df(
1796                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
1797                        )["#CHROM"]
1798                    )
1799                except:
1800                    chromosomes_list = [None]
1801
1802                for chrom in chromosomes_list:
1803                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")
1804
1805                    # Where clause
1806                    where_clause = ""
1807                    if chrom and len(chromosomes_list) > 1:
1808                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """
1809
1810                    # Update table
1811                    if proccess_all_fields_together:
1812                        sql_info_alter_table_array_join = ", ".join(
1813                            sql_info_alter_table_array
1814                        )
1815                        if sql_info_alter_table_array_join:
1816                            sql_info_alter_table = f"""
1817                                UPDATE {table_variants}
1818                                SET {sql_info_alter_table_array_join}
1819                                {where_clause}
1820                                """
1821                            log.debug(
1822                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
1823                            )
1824                            # log.debug(sql_info_alter_table)
1825                            self.conn.execute(sql_info_alter_table)
1826                    else:
1827                        sql_info_alter_num = 0
1828                        for sql_info_alter in sql_info_alter_table_array:
1829                            sql_info_alter_num += 1
1830                            sql_info_alter_table = f"""
1831                                UPDATE {table_variants}
1832                                SET {sql_info_alter}
1833                                {where_clause}
1834                                """
1835                            log.debug(
1836                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
1837                            )
1838                            # log.debug(sql_info_alter_table)
1839                            self.conn.execute(sql_info_alter_table)
1840
1841        # create indexes
1842        if create_index:
1843            self.create_indexes()
1844
1845        return added_columns
1846
1847    def create_indexes(self) -> None:
1848        """
1849        Create indexes on the table after insertion
1850        """
1851
1852        # Access
1853        access = self.get_config().get("access", None)
1854
1855        # get table variants
1856        table_variants = self.get_table_variants("FROM")
1857
1858        if self.get_indexing() and access not in ["RO"]:
1859            # Create index
1860            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
1861            self.conn.execute(sql_create_table_index)
1862            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
1863            self.conn.execute(sql_create_table_index)
1864            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
1865            self.conn.execute(sql_create_table_index)
1866            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
1867            self.conn.execute(sql_create_table_index)
1868            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
1869            self.conn.execute(sql_create_table_index)
1870            for field in self.index_additionnal_fields:
1871                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
1872                self.conn.execute(sql_create_table_index)
1873
1874    def drop_indexes(self) -> None:
1875        """
1876        Create indexes on the table after insertion
1877        """
1878
1879        # Access
1880        access = self.get_config().get("access", None)
1881
1882        # get table variants
1883        table_variants = self.get_table_variants("FROM")
1884
1885        # Get database format
1886        connexion_format = self.get_connexion_format()
1887
1888        if access not in ["RO"]:
1889            if connexion_format in ["duckdb"]:
1890                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
1891            elif connexion_format in ["sqlite"]:
1892                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
1893
1894            list_indexes = self.conn.execute(sql_list_indexes)
1895            index_names = [row[0] for row in list_indexes.fetchall()]
1896            for index in index_names:
1897                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
1898                self.conn.execute(sql_drop_table_index)
1899
1900    def read_vcf_header(self, f) -> list:
1901        """
1902        It reads the header of a VCF file and returns a list of the header lines
1903
1904        :param f: the file object
1905        :return: The header lines of the VCF file.
1906        """
1907
1908        header_list = []
1909        for line in f:
1910            header_list.append(line)
1911            if line.startswith("#CHROM"):
1912                break
1913        return header_list
1914
1915    def read_vcf_header_file(self, file: str = None) -> list:
1916        """
1917        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
1918        uncompressed files.
1919
1920        :param file: The `file` parameter is a string that represents the path to the VCF header file
1921        that you want to read. It is an optional parameter, so if you don't provide a value, it will
1922        default to `None`
1923        :type file: str
1924        :return: The function `read_vcf_header_file` returns a list.
1925        """
1926
1927        if self.get_input_compressed(input_file=file):
1928            with bgzf.open(file, "rt") as f:
1929                return self.read_vcf_header(f=f)
1930        else:
1931            with open(file, "rt") as f:
1932                return self.read_vcf_header(f=f)
1933
1934    def execute_query(self, query: str):
1935        """
1936        It takes a query as an argument, executes it, and returns the results
1937
1938        :param query: The query to be executed
1939        :return: The result of the query is being returned.
1940        """
1941        if query:
1942            return self.conn.execute(query)  # .fetchall()
1943        else:
1944            return None
1945
1946    def export_output(
1947        self,
1948        output_file: str | None = None,
1949        output_header: str | None = None,
1950        export_header: bool = True,
1951        query: str | None = None,
1952        parquet_partitions: list | None = None,
1953        chunk_size: int | None = None,
1954        threads: int | None = None,
1955        sort: bool = False,
1956        index: bool = False,
1957        order_by: str | None = None,
1958    ) -> bool:
1959        """
1960        The `export_output` function exports data from a VCF file to a specified output file in various
1961        formats, including VCF, CSV, TSV, PSV, and Parquet.
1962
1963        :param output_file: The `output_file` parameter is a string that specifies the name of the
1964        output file to be generated by the function. This is where the exported data will be saved
1965        :type output_file: str
1966        :param output_header: The `output_header` parameter is a string that specifies the name of the
1967        file where the header of the VCF file will be exported. If this parameter is not provided, the
1968        header will be exported to a file with the same name as the `output_file` parameter, but with
1969        the extension "
1970        :type output_header: str
1971        :param export_header: The `export_header` parameter is a boolean flag that determines whether
1972        the header of a VCF file should be exported to a separate file or not. If `export_header` is
1973        True, the header will be exported to a file. If `export_header` is False, the header will not
1974        be, defaults to True, if output format is not VCF
1975        :type export_header: bool (optional)
1976        :param query: The `query` parameter is an optional SQL query that can be used to filter and
1977        select specific data from the VCF file before exporting it. If provided, only the data that
1978        matches the query will be exported
1979        :type query: str
1980        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
1981        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
1982        organize data in a hierarchical directory structure based on the values of one or more columns.
1983        This can improve query performance when working with large datasets
1984        :type parquet_partitions: list
1985        :param chunk_size: The `chunk_size` parameter specifies the number of
1986        records in batch when exporting data in Parquet format. This parameter is used for
1987        partitioning the Parquet file into multiple files.
1988        :type chunk_size: int
1989        :param threads: The `threads` parameter is an optional parameter that specifies the number of
1990        threads to be used during the export process. It determines the level of parallelism and can
1991        improve the performance of the export operation. If not provided, the function will use the
1992        default number of threads
1993        :type threads: int
1994        :param sort: The `sort` parameter is a boolean flag that determines whether the output file
1995        should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the
1996        genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to
1997        False
1998        :type sort: bool (optional)
1999        :param index: The `index` parameter is a boolean flag that determines whether an index should be
2000        created on the output file. If `index` is True, an index will be created. If `index` is False,
2001        no index will be created. The default value is False, defaults to False
2002        :type index: bool (optional)
2003        :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for
2004        sorting the output file. This parameter is only applicable when exporting data in VCF format
2005        :type order_by: str
2006        :return: a boolean value. It checks if the output file exists and returns True if it does, or
2007        None if it doesn't.
2008        """
2009
2010        # Log
2011        log.info("Exporting...")
2012
2013        # Full path
2014        output_file = full_path(output_file)
2015        output_header = full_path(output_header)
2016
2017        # Config
2018        config = self.get_config()
2019
2020        # Param
2021        param = self.get_param()
2022
2023        # Tmp files to remove
2024        tmp_to_remove = []
2025
2026        # If no output, get it
2027        if not output_file:
2028            output_file = self.get_output()
2029
2030        # If not threads
2031        if not threads:
2032            threads = self.get_threads()
2033
2034        # Auto header name with extension
2035        if export_header or output_header:
2036            if not output_header:
2037                output_header = f"{output_file}.hdr"
2038            # Export header
2039            self.export_header(output_file=output_file)
2040
2041        # Switch off export header if VCF output
2042        output_file_type = get_file_format(output_file)
2043        if output_file_type in ["vcf"]:
2044            export_header = False
2045            tmp_to_remove.append(output_header)
2046
2047        # Chunk size
2048        if not chunk_size:
2049            chunk_size = config.get("chunk_size", None)
2050
2051        # Parquet partition
2052        if not parquet_partitions:
2053            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
2054        if parquet_partitions and isinstance(parquet_partitions, str):
2055            parquet_partitions = parquet_partitions.split(",")
2056
2057        # Order by
2058        if not order_by:
2059            order_by = param.get("export", {}).get("order_by", "")
2060
2061        # Header in output
2062        header_in_output = param.get("export", {}).get("include_header", False)
2063
2064        # Database
2065        database_source = self.get_connexion()
2066
2067        # Connexion format
2068        connexion_format = self.get_connexion_format()
2069
2070        # Explode infos
2071        if self.get_explode_infos():
2072            self.explode_infos(
2073                prefix=self.get_explode_infos_prefix(),
2074                fields=self.get_explode_infos_fields(),
2075                force=False,
2076            )
2077
2078        # if connexion_format in ["sqlite"] or query:
2079        if connexion_format in ["sqlite"]:
2080
2081            # Export in Parquet
2082            random_tmp = "".join(
2083                random.choice(string.ascii_lowercase) for i in range(10)
2084            )
2085            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
2086            tmp_to_remove.append(database_source)
2087
2088            # Table Variants
2089            table_variants = self.get_table_variants()
2090
2091            # Create export query
2092            sql_query_export_subquery = f"""
2093                SELECT * FROM {table_variants}
2094                """
2095
2096            # Write source file
2097            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))
2098
2099        # Create database
2100        database = Database(
2101            database=database_source,
2102            table="variants",
2103            header_file=output_header,
2104            conn_config=self.get_connexion_config(),
2105        )
2106
2107        # Existing colomns header
2108        # existing_columns_header = database.get_header_file_columns(output_header)
2109        existing_columns_header = database.get_header_columns_from_database()
2110
2111        # Export file
2112        database.export(
2113            output_database=output_file,
2114            output_header=output_header,
2115            existing_columns_header=existing_columns_header,
2116            parquet_partitions=parquet_partitions,
2117            chunk_size=chunk_size,
2118            threads=threads,
2119            sort=sort,
2120            index=index,
2121            header_in_output=header_in_output,
2122            order_by=order_by,
2123            query=query,
2124            export_header=export_header,
2125        )
2126
2127        # Remove
2128        remove_if_exists(tmp_to_remove)
2129
2130        return (os.path.exists(output_file) or None) and (
2131            os.path.exists(output_file) or None
2132        )
2133
2134    def get_extra_infos(self, table: str = None) -> list:
2135        """
2136        The `get_extra_infos` function returns a list of columns that are in a specified table but not
2137        in the header.
2138
2139        :param table: The `table` parameter in the `get_extra_infos` function is used to specify the
2140        name of the table from which you want to retrieve the extra columns that are not present in the
2141        header. If the `table` parameter is not provided when calling the function, it will default to
2142        using the variants
2143        :type table: str
2144        :return: A list of columns that are in the specified table but not in the header of the table.
2145        """
2146
2147        header_columns = []
2148
2149        if not table:
2150            table = self.get_table_variants(clause="from")
2151            header_columns = self.get_header_columns()
2152
2153        # Check all columns in the database
2154        query = f""" SELECT * FROM {table} LIMIT 1 """
2155        log.debug(f"query {query}")
2156        table_columns = self.get_query_to_df(query).columns.tolist()
2157        extra_columns = []
2158
2159        # Construct extra infos (not in header)
2160        for column in table_columns:
2161            if column not in header_columns:
2162                extra_columns.append(column)
2163
2164        return extra_columns
2165
2166    def get_extra_infos_sql(self, table: str = None) -> str:
2167        """
2168        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
2169        by double quotes
2170
2171        :param table: The name of the table to get the extra infos from. If None, the default table is
2172        used
2173        :type table: str
2174        :return: A string of the extra infos
2175        """
2176
2177        return ", ".join(
2178            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
2179        )
2180
2181    def export_header(
2182        self,
2183        header_name: str = None,
2184        output_file: str = None,
2185        output_file_ext: str = ".hdr",
2186        clean_header: bool = True,
2187        remove_chrom_line: bool = False,
2188    ) -> str:
2189        """
2190        The `export_header` function takes a VCF file, extracts the header, modifies it according to
2191        specified options, and writes it to a new file.
2192
2193        :param header_name: The `header_name` parameter is the name of the header file to be created. If
2194        this parameter is not specified, the header will be written to the output file
2195        :type header_name: str
2196        :param output_file: The `output_file` parameter in the `export_header` function is used to
2197        specify the name of the output file where the header will be written. If this parameter is not
2198        provided, the header will be written to a temporary file
2199        :type output_file: str
2200        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
2201        string that represents the extension of the output header file. By default, it is set to ".hdr"
2202        if not specified by the user. This extension will be appended to the `output_file` name to
2203        create the final, defaults to .hdr
2204        :type output_file_ext: str (optional)
2205        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
2206        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
2207        `True`, the function will clean the header by modifying certain lines based on a specific
2208        pattern. If `clean_header`, defaults to True
2209        :type clean_header: bool (optional)
2210        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
2211        boolean flag that determines whether the #CHROM line should be removed from the header before
2212        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
2213        defaults to False
2214        :type remove_chrom_line: bool (optional)
2215        :return: The function `export_header` returns the name of the temporary header file that is
2216        created.
2217        """
2218
2219        if not header_name and not output_file:
2220            output_file = self.get_output()
2221
2222        if self.get_header():
2223
2224            # Get header object
2225            header_obj = self.get_header()
2226
2227            # Create database
2228            db_for_header = Database(database=self.get_input())
2229
2230            # Get real columns in the file
2231            db_header_columns = db_for_header.get_columns()
2232
2233            with tempfile.TemporaryDirectory() as tmpdir:
2234
2235                # Write header file
2236                header_file_tmp = os.path.join(tmpdir, "header")
2237                f = open(header_file_tmp, "w")
2238                vcf.Writer(f, header_obj)
2239                f.close()
2240
2241                # Replace #CHROM line with rel columns
2242                header_list = db_for_header.read_header_file(
2243                    header_file=header_file_tmp
2244                )
2245                header_list[-1] = "\t".join(db_header_columns)
2246
2247                # Remove CHROM line
2248                if remove_chrom_line:
2249                    header_list.pop()
2250
2251                # Clean header
2252                if clean_header:
2253                    header_list_clean = []
2254                    for head in header_list:
2255                        # Clean head for malformed header
2256                        head_clean = head
2257                        head_clean = re.subn(
2258                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
2259                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
2260                            head_clean,
2261                            2,
2262                        )[0]
2263                        # Write header
2264                        header_list_clean.append(head_clean)
2265                    header_list = header_list_clean
2266
2267            tmp_header_name = output_file + output_file_ext
2268
2269            f = open(tmp_header_name, "w")
2270            for line in header_list:
2271                f.write(line)
2272            f.close()
2273
2274        return tmp_header_name
2275
2276    def export_variant_vcf(
2277        self,
2278        vcf_file,
2279        remove_info: bool = False,
2280        add_samples: bool = True,
2281        list_samples: list = [],
2282        where_clause: str = "",
2283        index: bool = False,
2284        threads: int | None = None,
2285    ) -> bool | None:
2286        """
2287        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
2288        remove INFO field, add samples, and control compression and indexing.
2289
2290        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
2291        written to. It is the output file that will contain the filtered VCF data based on the specified
2292        parameters
2293        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
2294        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
2295        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
2296        in, defaults to False
2297        :type remove_info: bool (optional)
2298        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
2299        the samples should be added to the VCF file or not. If set to True, the samples will be added.
2300        If set to False, the samples will be removed. The default value is True, defaults to True
2301        :type add_samples: bool (optional)
2302        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
2303        in the output VCF file. By default, all samples will be included. If you provide a list of
2304        samples, only those samples will be included in the output file
2305        :type list_samples: list
2306        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
2307        determines whether or not to create an index for the output VCF file. If `index` is set to
2308        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
2309        :type index: bool (optional)
2310        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
2311        number of threads to use for exporting the VCF file. It determines how many parallel threads
2312        will be used during the export process. More threads can potentially speed up the export process
2313        by utilizing multiple cores of the processor. If
2314        :type threads: int | None
2315        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
2316        method with various parameters including the output file, query, threads, sort flag, and index
2317        flag. The `export_output` method is responsible for exporting the VCF data based on the
2318        specified parameters and configurations provided in the `export_variant_vcf` function.
2319        """
2320
2321        # Config
2322        config = self.get_config()
2323
2324        # Extract VCF
2325        log.debug("Export VCF...")
2326
2327        # Table variants
2328        table_variants = self.get_table_variants()
2329
2330        # Threads
2331        if not threads:
2332            threads = self.get_threads()
2333
2334        # Info fields
2335        if remove_info:
2336            if not isinstance(remove_info, str):
2337                remove_info = "."
2338            info_field = f"""'{remove_info}' as INFO"""
2339        else:
2340            info_field = "INFO"
2341
2342        # Samples fields
2343        if add_samples:
2344            if not list_samples:
2345                list_samples = self.get_header_sample_list()
2346            if list_samples:
2347                samples_fields = " , FORMAT , " + " , ".join(list_samples)
2348            else:
2349                samples_fields = ""
2350            log.debug(f"samples_fields: {samples_fields}")
2351        else:
2352            samples_fields = ""
2353
2354        # Where clause
2355        if where_clause is None:
2356            where_clause = ""
2357
2358        # Variants
2359        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
2360        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
2361        log.debug(f"sql_query_select={sql_query_select}")
2362
2363        return self.export_output(
2364            output_file=vcf_file,
2365            output_header=None,
2366            export_header=True,
2367            query=sql_query_select,
2368            parquet_partitions=None,
2369            chunk_size=config.get("chunk_size", None),
2370            threads=threads,
2371            sort=True,
2372            index=index,
2373            order_by=None,
2374        )
2375
2376    def run_commands(self, commands: list = [], threads: int = 1) -> None:
2377        """
2378        It takes a list of commands and runs them in parallel using the number of threads specified
2379
2380        :param commands: A list of commands to run
2381        :param threads: The number of threads to use, defaults to 1 (optional)
2382        """
2383
2384        run_parallel_commands(commands, threads)
2385
2386    def get_threads(self, default: int = 1) -> int:
2387        """
2388        This function returns the number of threads to use for a job, with a default value of 1 if not
2389        specified.
2390
2391        :param default: The `default` parameter in the `get_threads` method is used to specify the
2392        default number of threads to use if no specific value is provided. If no value is provided for
2393        the `threads` parameter in the configuration or input parameters, the `default` value will be
2394        used, defaults to 1
2395        :type default: int (optional)
2396        :return: the number of threads to use for the current job.
2397        """
2398
2399        # Config
2400        config = self.get_config()
2401
2402        # Param
2403        param = self.get_param()
2404
2405        # Input threads
2406        input_thread = param.get("threads", config.get("threads", None))
2407
2408        # Check threads
2409        if not input_thread:
2410            threads = default
2411        elif int(input_thread) <= 0:
2412            threads = os.cpu_count()
2413        else:
2414            threads = int(input_thread)
2415        return threads
2416
2417    def get_memory(self, default: str = None) -> str:
2418        """
2419        This function retrieves the memory value from parameters or configuration with a default value
2420        if not found.
2421
2422        :param default: The `get_memory` function takes in a default value as a string parameter. This
2423        default value is used as a fallback in case the `memory` parameter is not provided in the
2424        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
2425        the function
2426        :type default: str
2427        :return: The `get_memory` function returns a string value representing the memory parameter. If
2428        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
2429        return the default value provided as an argument to the function.
2430        """
2431
2432        # Config
2433        config = self.get_config()
2434
2435        # Param
2436        param = self.get_param()
2437
2438        # Input threads
2439        input_memory = param.get("memory", config.get("memory", None))
2440
2441        # Check threads
2442        if input_memory:
2443            memory = input_memory
2444        else:
2445            memory = default
2446
2447        return memory
2448
2449    def update_from_vcf(self, vcf_file: str) -> None:
2450        """
2451        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
2452
2453        :param vcf_file: the path to the VCF file
2454        """
2455
2456        connexion_format = self.get_connexion_format()
2457
2458        if connexion_format in ["duckdb"]:
2459            self.update_from_vcf_duckdb(vcf_file)
2460        elif connexion_format in ["sqlite"]:
2461            self.update_from_vcf_sqlite(vcf_file)
2462
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table with the INFO column of a
        VCF file, using duckdb.

        The VCF body is loaded into a pandas DataFrame bound to the local name
        `vcf_df`; the SQL query references that name directly, relying on
        duckdb resolving local DataFrames by variable name (replacement scan),
        so no explicit registration is performed.

        :param vcf_file: the path to the VCF file
        """

        # variants table
        table_variants = self.get_table_variants()

        # Load the VCF body into a DataFrame, skipping the header lines
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # Append the VCF INFO to the existing INFO for variants matching on
        # #CHROM/POS/REF/ALT; a ';' separator is inserted only when both sides
        # are non-empty, and '' / '.' are treated as empty values
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)
2518
2519    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
2520        """
2521        It creates a temporary table in the SQLite database, loads the VCF file into the temporary
2522        table, then updates the INFO column of the variants table with the INFO column of the temporary
2523        table
2524
2525        :param vcf_file: The path to the VCF file you want to update the database with
2526        """
2527
2528        # Create a temporary table for the VCF
2529        table_vcf = "tmp_vcf"
2530        sql_create = (
2531            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
2532        )
2533        self.conn.execute(sql_create)
2534
2535        # Loading VCF into temporaire table
2536        vcf_df = pd.read_csv(
2537            vcf_file, sep="\t", comment="#", header=None, low_memory=False
2538        )
2539        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
2540        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)
2541
2542        # Update table 'variants' with VCF data
2543        # warning: CONCAT as || operator
2544        sql_query_update = f"""
2545            UPDATE variants as table_variants
2546            SET INFO = CASE
2547                            WHEN INFO NOT IN ('', '.')
2548                            THEN INFO
2549                            ELSE ''
2550                        END ||
2551                        (
2552                        SELECT 
2553                            CASE 
2554                                WHEN table_variants.INFO NOT IN ('','.') 
2555                                    AND table_vcf.INFO NOT IN ('','.')  
2556                                THEN ';' 
2557                                ELSE '' 
2558                            END || 
2559                            CASE 
2560                                WHEN table_vcf.INFO NOT IN ('','.') 
2561                                THEN table_vcf.INFO 
2562                                ELSE '' 
2563                            END
2564                        FROM {table_vcf} as table_vcf
2565                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
2566                            AND table_vcf.\"POS\" = table_variants.\"POS\"
2567                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
2568                            AND table_vcf.\"REF\" = table_variants.\"REF\"
2569                        )
2570        """
2571        self.conn.execute(sql_query_update)
2572
2573        # Drop temporary table
2574        sql_drop = f"DROP TABLE {table_vcf}"
2575        self.conn.execute(sql_drop)
2576
2577    def drop_variants_table(self) -> None:
2578        """
2579        > This function drops the variants table
2580        """
2581
2582        table_variants = self.get_table_variants()
2583        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
2584        self.conn.execute(sql_table_variants)
2585
    def set_variant_id(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Add a `variant_id` column to the variants table and populate it with a
        hash built from the assembly name and the `#CHROM`, `POS`, `REF` and
        `ALT` columns (plus an SVTYPE term, see NOTE in the body).

        :param variant_id_column: The name of the column to be created in the variants table, defaults
        to variant_id
        :type variant_id_column: str (optional)
        :param force: If True, the variant_id column will be created even if it already exists
        :type force: bool
        :return: The name of the column that contains the variant_id
        """

        # Assembly: parameters take precedence over configuration, then default
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # INFO/Tag prefix
        prefix = self.get_explode_infos_prefix()

        # Explode INFO/SVTYPE into a dedicated column (dropped again at the end)
        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

        # variants table
        table_variants = self.get_table_variants()

        # Fall back to the default column name when an empty name is given
        if not variant_id_column:
            variant_id_column = "variant_id"

        # Create and fill the variant_id column only when absent, or when forced
        if "variant_id" not in self.get_extra_infos() or force:

            # Create column
            self.add_column(
                table_name=table_variants,
                column_name=variant_id_column,
                column_type="UBIGINT",
                default_value="0",
            )

            # Update column
            # NOTE(review): the last hash() argument is the single-quoted
            # string '"{prefix}SVTYPE"', i.e. a constant literal (same for
            # every row) rather than the exploded SVTYPE column value —
            # confirm whether this is intentional
            self.conn.execute(
                f"""
                    UPDATE {table_variants}
                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
                """
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # return variant_id column name
        return variant_id_column
2644
2645    def get_variant_id_column(
2646        self, variant_id_column: str = "variant_id", force: bool = None
2647    ) -> str:
2648        """
2649        This function returns the variant_id column name
2650
2651        :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
2652        defaults to variant_id
2653        :type variant_id_column: str (optional)
2654        :param force: If True, will force the variant_id to be set to the value of variant_id_column. If
2655        False, will only set the variant_id if it is not already set. If None, will set the variant_id
2656        if it is not already set, or if it is set
2657        :type force: bool
2658        :return: The variant_id column name.
2659        """
2660
2661        return self.set_variant_id(variant_id_column=variant_id_column, force=force)
2662
2663    ###
2664    # Annotation
2665    ###
2666
2667    def scan_databases(
2668        self,
2669        database_formats: list = ["parquet"],
2670        database_releases: list = ["current"],
2671    ) -> dict:
2672        """
2673        The function `scan_databases` scans for available databases based on specified formats and
2674        releases.
2675
2676        :param database_formats: The `database_formats` parameter is a list that specifies the formats
2677        of the databases to be scanned. In this case, the accepted format is "parquet"
2678        :type database_formats: list ["parquet"]
2679        :param database_releases: The `database_releases` parameter is a list that specifies the
2680        releases of the databases to be scanned. In the provided function, the default value for
2681        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
2682        databases that are in the "current"
2683        :type database_releases: list
2684        :return: The function `scan_databases` returns a dictionary containing information about
2685        databases that match the specified formats and releases.
2686        """
2687
2688        # Config
2689        config = self.get_config()
2690
2691        # Param
2692        param = self.get_param()
2693
2694        # Param - Assembly
2695        assembly = param.get("assembly", config.get("assembly", None))
2696        if not assembly:
2697            assembly = DEFAULT_ASSEMBLY
2698            log.warning(f"Default assembly '{assembly}'")
2699
2700        # Scan for availabled databases
2701        log.info(
2702            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
2703        )
2704        databases_infos_dict = databases_infos(
2705            database_folder_releases=database_releases,
2706            database_formats=database_formats,
2707            assembly=assembly,
2708            config=config,
2709        )
2710        log.info(
2711            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
2712        )
2713
2714        return databases_infos_dict
2715
2716    def annotation(self) -> None:
2717        """
2718        It annotates the VCF file with the annotations specified in the config file.
2719        """
2720
2721        # Config
2722        config = self.get_config()
2723
2724        # Param
2725        param = self.get_param()
2726
2727        # Param - Assembly
2728        assembly = param.get("assembly", config.get("assembly", None))
2729        if not assembly:
2730            assembly = DEFAULT_ASSEMBLY
2731            log.warning(f"Default assembly '{assembly}'")
2732
2733        # annotations databases folders
2734        annotations_databases = set(
2735            config.get("folders", {})
2736            .get("databases", {})
2737            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
2738            + config.get("folders", {})
2739            .get("databases", {})
2740            .get("parquet", ["~/howard/databases/parquet/current"])
2741            + config.get("folders", {})
2742            .get("databases", {})
2743            .get("bcftools", ["~/howard/databases/bcftools/current"])
2744        )
2745
2746        # Get param annotations
2747        if param.get("annotations", None) and isinstance(
2748            param.get("annotations", None), str
2749        ):
2750            log.debug(param.get("annotations", None))
2751            param_annotation_list = param.get("annotations").split(",")
2752        else:
2753            param_annotation_list = []
2754
2755        # Each tools param
2756        if param.get("annotation_parquet", None) != None:
2757            log.debug(
2758                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
2759            )
2760            if isinstance(param.get("annotation_parquet", None), list):
2761                param_annotation_list.append(",".join(param.get("annotation_parquet")))
2762            else:
2763                param_annotation_list.append(param.get("annotation_parquet"))
2764        if param.get("annotation_snpsift", None) != None:
2765            if isinstance(param.get("annotation_snpsift", None), list):
2766                param_annotation_list.append(
2767                    "snpsift:"
2768                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
2769                )
2770            else:
2771                param_annotation_list.append(
2772                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
2773                )
2774        if param.get("annotation_snpeff", None) != None:
2775            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
2776        if param.get("annotation_bcftools", None) != None:
2777            if isinstance(param.get("annotation_bcftools", None), list):
2778                param_annotation_list.append(
2779                    "bcftools:"
2780                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
2781                )
2782            else:
2783                param_annotation_list.append(
2784                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
2785                )
2786        if param.get("annotation_annovar", None) != None:
2787            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
2788        if param.get("annotation_exomiser", None) != None:
2789            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
2790        if param.get("annotation_splice", None) != None:
2791            param_annotation_list.append("splice:" + param.get("annotation_splice"))
2792
2793        # Merge param annotations list
2794        param["annotations"] = ",".join(param_annotation_list)
2795
2796        # debug
2797        log.debug(f"param_annotations={param['annotations']}")
2798
2799        if param.get("annotations"):
2800
2801            # Log
2802            # log.info("Annotations - Check annotation parameters")
2803
2804            if not "annotation" in param:
2805                param["annotation"] = {}
2806
2807            # List of annotations parameters
2808            annotations_list_input = {}
2809            if isinstance(param.get("annotations", None), str):
2810                annotation_file_list = [
2811                    value for value in param.get("annotations", "").split(",")
2812                ]
2813                for annotation_file in annotation_file_list:
2814                    annotations_list_input[annotation_file] = {"INFO": None}
2815            else:
2816                annotations_list_input = param.get("annotations", {})
2817
2818            log.info(f"Quick Annotations:")
2819            for annotation_key in list(annotations_list_input.keys()):
2820                log.info(f"   {annotation_key}")
2821
2822            # List of annotations and associated fields
2823            annotations_list = {}
2824
2825            for annotation_file in annotations_list_input:
2826
2827                # Explode annotations if ALL
2828                if (
2829                    annotation_file.upper() == "ALL"
2830                    or annotation_file.upper().startswith("ALL:")
2831                ):
2832
2833                    # check ALL parameters (formats, releases)
2834                    annotation_file_split = annotation_file.split(":")
2835                    database_formats = "parquet"
2836                    database_releases = "current"
2837                    for annotation_file_option in annotation_file_split[1:]:
2838                        database_all_options_split = annotation_file_option.split("=")
2839                        if database_all_options_split[0] == "format":
2840                            database_formats = database_all_options_split[1].split("+")
2841                        if database_all_options_split[0] == "release":
2842                            database_releases = database_all_options_split[1].split("+")
2843
2844                    # Scan for availabled databases
2845                    databases_infos_dict = self.scan_databases(
2846                        database_formats=database_formats,
2847                        database_releases=database_releases,
2848                    )
2849
2850                    # Add found databases in annotation parameters
2851                    for database_infos in databases_infos_dict.keys():
2852                        annotations_list[database_infos] = {"INFO": None}
2853
2854                else:
2855                    annotations_list[annotation_file] = annotations_list_input[
2856                        annotation_file
2857                    ]
2858
2859            # Check each databases
2860            if len(annotations_list):
2861
2862                log.info(
2863                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
2864                )
2865
2866                for annotation_file in annotations_list:
2867
2868                    # Init
2869                    annotations = annotations_list.get(annotation_file, None)
2870
2871                    # Annotation snpEff
2872                    if annotation_file.startswith("snpeff"):
2873
2874                        log.debug(f"Quick Annotation snpEff")
2875
2876                        if "snpeff" not in param["annotation"]:
2877                            param["annotation"]["snpeff"] = {}
2878
2879                        if "options" not in param["annotation"]["snpeff"]:
2880                            param["annotation"]["snpeff"]["options"] = ""
2881
2882                        # snpEff options in annotations
2883                        param["annotation"]["snpeff"]["options"] = "".join(
2884                            annotation_file.split(":")[1:]
2885                        )
2886
2887                    # Annotation Annovar
2888                    elif annotation_file.startswith("annovar"):
2889
2890                        log.debug(f"Quick Annotation Annovar")
2891
2892                        if "annovar" not in param["annotation"]:
2893                            param["annotation"]["annovar"] = {}
2894
2895                        if "annotations" not in param["annotation"]["annovar"]:
2896                            param["annotation"]["annovar"]["annotations"] = {}
2897
2898                        # Options
2899                        annotation_file_split = annotation_file.split(":")
2900                        for annotation_file_annotation in annotation_file_split[1:]:
2901                            if annotation_file_annotation:
2902                                param["annotation"]["annovar"]["annotations"][
2903                                    annotation_file_annotation
2904                                ] = annotations
2905
2906                    # Annotation Exomiser
2907                    elif annotation_file.startswith("exomiser"):
2908
2909                        log.debug(f"Quick Annotation Exomiser")
2910
2911                        param["annotation"]["exomiser"] = params_string_to_dict(
2912                            annotation_file
2913                        )
2914
2915                    # Annotation Splice
2916                    elif annotation_file.startswith("splice"):
2917
2918                        log.debug(f"Quick Annotation Splice")
2919
2920                        param["annotation"]["splice"] = params_string_to_dict(
2921                            annotation_file
2922                        )
2923
2924                    # Annotation Parquet or BCFTOOLS
2925                    else:
2926
2927                        # Tools detection
2928                        if annotation_file.startswith("bcftools:"):
2929                            annotation_tool_initial = "bcftools"
2930                            annotation_file = ":".join(annotation_file.split(":")[1:])
2931                        elif annotation_file.startswith("snpsift:"):
2932                            annotation_tool_initial = "snpsift"
2933                            annotation_file = ":".join(annotation_file.split(":")[1:])
2934                        else:
2935                            annotation_tool_initial = None
2936
2937                        # list of files
2938                        annotation_file_list = annotation_file.replace("+", ":").split(
2939                            ":"
2940                        )
2941
2942                        for annotation_file in annotation_file_list:
2943
2944                            if annotation_file:
2945
2946                                # Annotation tool initial
2947                                annotation_tool = annotation_tool_initial
2948
2949                                # Find file
2950                                annotation_file_found = None
2951
2952                                # Expand user
2953                                annotation_file = full_path(annotation_file)
2954
2955                                if os.path.exists(annotation_file):
2956                                    annotation_file_found = annotation_file
2957
2958                                else:
2959                                    # Find within assembly folders
2960                                    for annotations_database in annotations_databases:
2961                                        found_files = find_all(
2962                                            annotation_file,
2963                                            os.path.join(
2964                                                annotations_database, assembly
2965                                            ),
2966                                        )
2967                                        if len(found_files) > 0:
2968                                            annotation_file_found = found_files[0]
2969                                            break
2970                                    if not annotation_file_found and not assembly:
2971                                        # Find within folders
2972                                        for (
2973                                            annotations_database
2974                                        ) in annotations_databases:
2975                                            found_files = find_all(
2976                                                annotation_file, annotations_database
2977                                            )
2978                                            if len(found_files) > 0:
2979                                                annotation_file_found = found_files[0]
2980                                                break
2981                                log.debug(
2982                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
2983                                )
2984
2985                                # Full path
2986                                annotation_file_found = full_path(annotation_file_found)
2987
2988                                if annotation_file_found:
2989
2990                                    database = Database(database=annotation_file_found)
2991                                    quick_annotation_format = database.get_format()
2992                                    quick_annotation_is_compressed = (
2993                                        database.is_compressed()
2994                                    )
2995                                    quick_annotation_is_indexed = os.path.exists(
2996                                        f"{annotation_file_found}.tbi"
2997                                    )
2998                                    bcftools_preference = False
2999
3000                                    # Check Annotation Tool
3001                                    if not annotation_tool:
3002                                        if (
3003                                            bcftools_preference
3004                                            and quick_annotation_format
3005                                            in ["vcf", "bed"]
3006                                            and quick_annotation_is_compressed
3007                                            and quick_annotation_is_indexed
3008                                        ):
3009                                            annotation_tool = "bcftools"
3010                                        elif quick_annotation_format in [
3011                                            "vcf",
3012                                            "bed",
3013                                            "tsv",
3014                                            "tsv",
3015                                            "csv",
3016                                            "json",
3017                                            "tbl",
3018                                            "parquet",
3019                                            "duckdb",
3020                                        ]:
3021                                            annotation_tool = "parquet"
3022                                        else:
3023                                            log.error(
3024                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
3025                                            )
3026                                            raise ValueError(
3027                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
3028                                            )
3029
3030                                    log.debug(
3031                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
3032                                    )
3033
3034                                    # Annotation Tool dispatch
3035                                    if annotation_tool:
3036                                        if annotation_tool not in param["annotation"]:
3037                                            param["annotation"][annotation_tool] = {}
3038                                        if (
3039                                            "annotations"
3040                                            not in param["annotation"][annotation_tool]
3041                                        ):
3042                                            param["annotation"][annotation_tool][
3043                                                "annotations"
3044                                            ] = {}
3045                                        param["annotation"][annotation_tool][
3046                                            "annotations"
3047                                        ][annotation_file_found] = annotations
3048
3049                                else:
3050                                    log.error(
3051                                        f"Quick Annotation File {annotation_file} does NOT exist"
3052                                    )
3053
3054                self.set_param(param)
3055
3056        if param.get("annotation", None):
3057            log.info("Annotations")
3058            if param.get("annotation", {}).get("parquet", None):
3059                log.info("Annotations 'parquet'...")
3060                self.annotation_parquet()
3061            if param.get("annotation", {}).get("bcftools", None):
3062                log.info("Annotations 'bcftools'...")
3063                self.annotation_bcftools()
3064            if param.get("annotation", {}).get("snpsift", None):
3065                log.info("Annotations 'snpsift'...")
3066                self.annotation_snpsift()
3067            if param.get("annotation", {}).get("annovar", None):
3068                log.info("Annotations 'annovar'...")
3069                self.annotation_annovar()
3070            if param.get("annotation", {}).get("snpeff", None):
3071                log.info("Annotations 'snpeff'...")
3072                self.annotation_snpeff()
3073            if param.get("annotation", {}).get("exomiser", None) is not None:
3074                log.info("Annotations 'exomiser'...")
3075                self.annotation_exomiser()
3076            if param.get("annotation", {}).get("splice", None) is not None:
3077                log.info("Annotations 'splice' ...")
3078                self.annotation_splice()
3079
3080        # Explode INFOS fields into table fields
3081        if self.get_explode_infos():
3082            self.explode_infos(
3083                prefix=self.get_explode_infos_prefix(),
3084                fields=self.get_explode_infos_fields(),
3085                force=True,
3086            )
3087
3088    def annotation_snpsift(self, threads: int = None) -> None:
3089        """
3090        This function annotate with bcftools
3091
3092        :param threads: Number of threads to use
3093        :return: the value of the variable "return_value".
3094        """
3095
3096        # DEBUG
3097        log.debug("Start annotation with bcftools databases")
3098
3099        # Threads
3100        if not threads:
3101            threads = self.get_threads()
3102        log.debug("Threads: " + str(threads))
3103
3104        # Config
3105        config = self.get_config()
3106        log.debug("Config: " + str(config))
3107
3108        # Config - snpSift
3109        snpsift_bin_command = get_bin_command(
3110            bin="SnpSift.jar",
3111            tool="snpsift",
3112            bin_type="jar",
3113            config=config,
3114            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
3115        )
3116        if not snpsift_bin_command:
3117            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
3118            log.error(msg_err)
3119            raise ValueError(msg_err)
3120
3121        # Config - bcftools
3122        bcftools_bin_command = get_bin_command(
3123            bin="bcftools",
3124            tool="bcftools",
3125            bin_type="bin",
3126            config=config,
3127            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
3128        )
3129        if not bcftools_bin_command:
3130            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
3131            log.error(msg_err)
3132            raise ValueError(msg_err)
3133
3134        # Config - BCFTools databases folders
3135        databases_folders = set(
3136            self.get_config()
3137            .get("folders", {})
3138            .get("databases", {})
3139            .get("annotations", ["."])
3140            + self.get_config()
3141            .get("folders", {})
3142            .get("databases", {})
3143            .get("bcftools", ["."])
3144        )
3145        log.debug("Databases annotations: " + str(databases_folders))
3146
3147        # Param
3148        annotations = (
3149            self.get_param()
3150            .get("annotation", {})
3151            .get("snpsift", {})
3152            .get("annotations", None)
3153        )
3154        log.debug("Annotations: " + str(annotations))
3155
3156        # Assembly
3157        assembly = self.get_param().get(
3158            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
3159        )
3160
3161        # Data
3162        table_variants = self.get_table_variants()
3163
3164        # Check if not empty
3165        log.debug("Check if not empty")
3166        sql_query_chromosomes = (
3167            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3168        )
3169        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
3170        if not sql_query_chromosomes_df["count"][0]:
3171            log.info(f"VCF empty")
3172            return
3173
3174        # VCF header
3175        vcf_reader = self.get_header()
3176        log.debug("Initial header: " + str(vcf_reader.infos))
3177
3178        # Existing annotations
3179        for vcf_annotation in self.get_header().infos:
3180
3181            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
3182            log.debug(
3183                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
3184            )
3185
3186        if annotations:
3187
3188            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
3189
3190                # Export VCF file
3191                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
3192
3193                # Init
3194                commands = {}
3195
3196                for annotation in annotations:
3197                    annotation_fields = annotations[annotation]
3198
3199                    # Annotation Name
3200                    annotation_name = os.path.basename(annotation)
3201
3202                    if not annotation_fields:
3203                        annotation_fields = {"INFO": None}
3204
3205                    log.debug(f"Annotation '{annotation_name}'")
3206                    log.debug(
3207                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
3208                    )
3209
3210                    # Create Database
3211                    database = Database(
3212                        database=annotation,
3213                        databases_folders=databases_folders,
3214                        assembly=assembly,
3215                    )
3216
3217                    # Find files
3218                    db_file = database.get_database()
3219                    db_file = full_path(db_file)
3220                    db_hdr_file = database.get_header_file()
3221                    db_hdr_file = full_path(db_hdr_file)
3222                    db_file_type = database.get_format()
3223                    db_tbi_file = f"{db_file}.tbi"
3224                    db_file_compressed = database.is_compressed()
3225
3226                    # Check if compressed
3227                    if not db_file_compressed:
3228                        log.error(
3229                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3230                        )
3231                        raise ValueError(
3232                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3233                        )
3234
3235                    # Check if indexed
3236                    if not os.path.exists(db_tbi_file):
3237                        log.error(
3238                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3239                        )
3240                        raise ValueError(
3241                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3242                        )
3243
3244                    # Check index - try to create if not exists
3245                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
3246                        log.error("Annotation failed: database not valid")
3247                        log.error(f"Annotation annotation file: {db_file}")
3248                        log.error(f"Annotation annotation header: {db_hdr_file}")
3249                        log.error(f"Annotation annotation index: {db_tbi_file}")
3250                        raise ValueError(
3251                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
3252                        )
3253                    else:
3254
3255                        log.debug(
3256                            f"Annotation '{annotation}' - file: "
3257                            + str(db_file)
3258                            + " and "
3259                            + str(db_hdr_file)
3260                        )
3261
3262                        # Load header as VCF object
3263                        db_hdr_vcf = Variants(input=db_hdr_file)
3264                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
3265                        log.debug(
3266                            "Annotation database header: "
3267                            + str(db_hdr_vcf_header_infos)
3268                        )
3269
3270                        # For all fields in database
3271                        annotation_fields_full = False
3272                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
3273                            annotation_fields = {
3274                                key: key for key in db_hdr_vcf_header_infos
3275                            }
3276                            log.debug(
3277                                "Annotation database header - All annotations added: "
3278                                + str(annotation_fields)
3279                            )
3280                            annotation_fields_full = True
3281
3282                        # # Create file for field rename
3283                        # log.debug("Create file for field rename")
3284                        # tmp_rename = NamedTemporaryFile(
3285                        #     prefix=self.get_prefix(),
3286                        #     dir=self.get_tmp_dir(),
3287                        #     suffix=".rename",
3288                        #     delete=False,
3289                        # )
3290                        # tmp_rename_name = tmp_rename.name
3291                        # tmp_files.append(tmp_rename_name)
3292
3293                        # Number of fields
3294                        nb_annotation_field = 0
3295                        annotation_list = []
3296                        annotation_infos_rename_list = []
3297
3298                        for annotation_field in annotation_fields:
3299
3300                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
3301                            annotation_fields_new_name = annotation_fields.get(
3302                                annotation_field, annotation_field
3303                            )
3304                            if not annotation_fields_new_name:
3305                                annotation_fields_new_name = annotation_field
3306
3307                            # Check if field is in DB and if field is not elready in input data
3308                            if (
3309                                annotation_field in db_hdr_vcf.get_header().infos
3310                                and annotation_fields_new_name
3311                                not in self.get_header().infos
3312                            ):
3313
3314                                log.info(
3315                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
3316                                )
3317
3318                                # BCFTools annotate param to rename fields
3319                                if annotation_field != annotation_fields_new_name:
3320                                    annotation_infos_rename_list.append(
3321                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
3322                                    )
3323
3324                                # Add INFO field to header
3325                                db_hdr_vcf_header_infos_number = (
3326                                    db_hdr_vcf_header_infos[annotation_field].num or "."
3327                                )
3328                                db_hdr_vcf_header_infos_type = (
3329                                    db_hdr_vcf_header_infos[annotation_field].type
3330                                    or "String"
3331                                )
3332                                db_hdr_vcf_header_infos_description = (
3333                                    db_hdr_vcf_header_infos[annotation_field].desc
3334                                    or f"{annotation_field} description"
3335                                )
3336                                db_hdr_vcf_header_infos_source = (
3337                                    db_hdr_vcf_header_infos[annotation_field].source
3338                                    or "unknown"
3339                                )
3340                                db_hdr_vcf_header_infos_version = (
3341                                    db_hdr_vcf_header_infos[annotation_field].version
3342                                    or "unknown"
3343                                )
3344
3345                                vcf_reader.infos[annotation_fields_new_name] = (
3346                                    vcf.parser._Info(
3347                                        annotation_fields_new_name,
3348                                        db_hdr_vcf_header_infos_number,
3349                                        db_hdr_vcf_header_infos_type,
3350                                        db_hdr_vcf_header_infos_description,
3351                                        db_hdr_vcf_header_infos_source,
3352                                        db_hdr_vcf_header_infos_version,
3353                                        self.code_type_map[
3354                                            db_hdr_vcf_header_infos_type
3355                                        ],
3356                                    )
3357                                )
3358
3359                                annotation_list.append(annotation_field)
3360
3361                                nb_annotation_field += 1
3362
3363                            else:
3364
3365                                if (
3366                                    annotation_field
3367                                    not in db_hdr_vcf.get_header().infos
3368                                ):
3369                                    log.warning(
3370                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
3371                                    )
3372                                if (
3373                                    annotation_fields_new_name
3374                                    in self.get_header().infos
3375                                ):
3376                                    log.warning(
3377                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
3378                                    )
3379
3380                        log.info(
3381                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
3382                        )
3383
3384                        annotation_infos = ",".join(annotation_list)
3385
3386                        if annotation_infos != "":
3387
3388                            # Annotated VCF (and error file)
3389                            tmp_annotation_vcf_name = os.path.join(
3390                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
3391                            )
3392                            tmp_annotation_vcf_name_err = (
3393                                tmp_annotation_vcf_name + ".err"
3394                            )
3395
3396                            # Add fields to annotate
3397                            if not annotation_fields_full:
3398                                annotation_infos_option = f"-info {annotation_infos}"
3399                            else:
3400                                annotation_infos_option = ""
3401
3402                            # Info fields rename
3403                            if annotation_infos_rename_list:
3404                                annotation_infos_rename = " -c " + ",".join(
3405                                    annotation_infos_rename_list
3406                                )
3407                            else:
3408                                annotation_infos_rename = ""
3409
3410                            # Annotate command
3411                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
3412
3413                            # Add command
3414                            commands[command_annotate] = tmp_annotation_vcf_name
3415
3416                if commands:
3417
3418                    # Export VCF file
3419                    self.export_variant_vcf(
3420                        vcf_file=tmp_vcf_name,
3421                        remove_info=True,
3422                        add_samples=False,
3423                        index=True,
3424                    )
3425                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
3426
3427                    # Num command
3428                    nb_command = 0
3429
3430                    # Annotate
3431                    for command_annotate in commands:
3432                        nb_command += 1
3433                        log.info(
3434                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
3435                        )
3436                        log.debug(f"command_annotate={command_annotate}")
3437                        run_parallel_commands([command_annotate], threads)
3438
3439                        # Debug
3440                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
3441
3442                        # Update variants
3443                        log.info(
3444                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
3445                        )
3446                        self.update_from_vcf(commands[command_annotate])
3447
3448    def annotation_bcftools(self, threads: int = None) -> None:
3449        """
3450        This function annotate with bcftools
3451
3452        :param threads: Number of threads to use
3453        :return: the value of the variable "return_value".
3454        """
3455
3456        # DEBUG
3457        log.debug("Start annotation with bcftools databases")
3458
3459        # Threads
3460        if not threads:
3461            threads = self.get_threads()
3462        log.debug("Threads: " + str(threads))
3463
3464        # Config
3465        config = self.get_config()
3466        log.debug("Config: " + str(config))
3467
3468        # DEBUG
3469        delete_tmp = True
3470        if self.get_config().get("verbosity", "warning") in ["debug"]:
3471            delete_tmp = False
3472            log.debug("Delete tmp files/folders: " + str(delete_tmp))
3473
3474        # Config - BCFTools bin command
3475        bcftools_bin_command = get_bin_command(
3476            bin="bcftools",
3477            tool="bcftools",
3478            bin_type="bin",
3479            config=config,
3480            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
3481        )
3482        if not bcftools_bin_command:
3483            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
3484            log.error(msg_err)
3485            raise ValueError(msg_err)
3486
3487        # Config - BCFTools databases folders
3488        databases_folders = set(
3489            self.get_config()
3490            .get("folders", {})
3491            .get("databases", {})
3492            .get("annotations", ["."])
3493            + self.get_config()
3494            .get("folders", {})
3495            .get("databases", {})
3496            .get("bcftools", ["."])
3497        )
3498        log.debug("Databases annotations: " + str(databases_folders))
3499
3500        # Param
3501        annotations = (
3502            self.get_param()
3503            .get("annotation", {})
3504            .get("bcftools", {})
3505            .get("annotations", None)
3506        )
3507        log.debug("Annotations: " + str(annotations))
3508
3509        # Assembly
3510        assembly = self.get_param().get(
3511            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
3512        )
3513
3514        # Data
3515        table_variants = self.get_table_variants()
3516
3517        # Check if not empty
3518        log.debug("Check if not empty")
3519        sql_query_chromosomes = (
3520            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3521        )
3522        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
3523        if not sql_query_chromosomes_df["count"][0]:
3524            log.info(f"VCF empty")
3525            return
3526
3527        # Export in VCF
3528        log.debug("Create initial file to annotate")
3529        tmp_vcf = NamedTemporaryFile(
3530            prefix=self.get_prefix(),
3531            dir=self.get_tmp_dir(),
3532            suffix=".vcf.gz",
3533            delete=False,
3534        )
3535        tmp_vcf_name = tmp_vcf.name
3536
3537        # VCF header
3538        vcf_reader = self.get_header()
3539        log.debug("Initial header: " + str(vcf_reader.infos))
3540
3541        # Existing annotations
3542        for vcf_annotation in self.get_header().infos:
3543
3544            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
3545            log.debug(
3546                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
3547            )
3548
3549        if annotations:
3550
3551            tmp_ann_vcf_list = []
3552            commands = []
3553            tmp_files = []
3554            err_files = []
3555
3556            for annotation in annotations:
3557                annotation_fields = annotations[annotation]
3558
3559                # Annotation Name
3560                annotation_name = os.path.basename(annotation)
3561
3562                if not annotation_fields:
3563                    annotation_fields = {"INFO": None}
3564
3565                log.debug(f"Annotation '{annotation_name}'")
3566                log.debug(
3567                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
3568                )
3569
3570                # Create Database
3571                database = Database(
3572                    database=annotation,
3573                    databases_folders=databases_folders,
3574                    assembly=assembly,
3575                )
3576
3577                # Find files
3578                db_file = database.get_database()
3579                db_file = full_path(db_file)
3580                db_hdr_file = database.get_header_file()
3581                db_hdr_file = full_path(db_hdr_file)
3582                db_file_type = database.get_format()
3583                db_tbi_file = f"{db_file}.tbi"
3584                db_file_compressed = database.is_compressed()
3585
3586                # Check if compressed
3587                if not db_file_compressed:
3588                    log.error(
3589                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
3590                    )
3591                    raise ValueError(
3592                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
3593                    )
3594
3595                # Check if indexed
3596                if not os.path.exists(db_tbi_file):
3597                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
3598                    raise ValueError(
3599                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
3600                    )
3601
3602                # Check index - try to create if not exists
3603                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
3604                    log.error("Annotation failed: database not valid")
3605                    log.error(f"Annotation annotation file: {db_file}")
3606                    log.error(f"Annotation annotation header: {db_hdr_file}")
3607                    log.error(f"Annotation annotation index: {db_tbi_file}")
3608                    raise ValueError(
3609                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
3610                    )
3611                else:
3612
3613                    log.debug(
3614                        f"Annotation '{annotation}' - file: "
3615                        + str(db_file)
3616                        + " and "
3617                        + str(db_hdr_file)
3618                    )
3619
3620                    # Load header as VCF object
3621                    db_hdr_vcf = Variants(input=db_hdr_file)
3622                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
3623                    log.debug(
3624                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
3625                    )
3626
3627                    # For all fields in database
3628                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
3629                        annotation_fields = {
3630                            key: key for key in db_hdr_vcf_header_infos
3631                        }
3632                        log.debug(
3633                            "Annotation database header - All annotations added: "
3634                            + str(annotation_fields)
3635                        )
3636
3637                    # Number of fields
3638                    nb_annotation_field = 0
3639                    annotation_list = []
3640
3641                    for annotation_field in annotation_fields:
3642
3643                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
3644                        annotation_fields_new_name = annotation_fields.get(
3645                            annotation_field, annotation_field
3646                        )
3647                        if not annotation_fields_new_name:
3648                            annotation_fields_new_name = annotation_field
3649
3650                        # Check if field is in DB and if field is not elready in input data
3651                        if (
3652                            annotation_field in db_hdr_vcf.get_header().infos
3653                            and annotation_fields_new_name
3654                            not in self.get_header().infos
3655                        ):
3656
3657                            log.info(
3658                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
3659                            )
3660
3661                            # Add INFO field to header
3662                            db_hdr_vcf_header_infos_number = (
3663                                db_hdr_vcf_header_infos[annotation_field].num or "."
3664                            )
3665                            db_hdr_vcf_header_infos_type = (
3666                                db_hdr_vcf_header_infos[annotation_field].type
3667                                or "String"
3668                            )
3669                            db_hdr_vcf_header_infos_description = (
3670                                db_hdr_vcf_header_infos[annotation_field].desc
3671                                or f"{annotation_field} description"
3672                            )
3673                            db_hdr_vcf_header_infos_source = (
3674                                db_hdr_vcf_header_infos[annotation_field].source
3675                                or "unknown"
3676                            )
3677                            db_hdr_vcf_header_infos_version = (
3678                                db_hdr_vcf_header_infos[annotation_field].version
3679                                or "unknown"
3680                            )
3681
3682                            vcf_reader.infos[annotation_fields_new_name] = (
3683                                vcf.parser._Info(
3684                                    annotation_fields_new_name,
3685                                    db_hdr_vcf_header_infos_number,
3686                                    db_hdr_vcf_header_infos_type,
3687                                    db_hdr_vcf_header_infos_description,
3688                                    db_hdr_vcf_header_infos_source,
3689                                    db_hdr_vcf_header_infos_version,
3690                                    self.code_type_map[db_hdr_vcf_header_infos_type],
3691                                )
3692                            )
3693
3694                            # annotation_list.append(annotation_field)
3695                            if annotation_field != annotation_fields_new_name:
3696                                annotation_list.append(
3697                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
3698                                )
3699                            else:
3700                                annotation_list.append(annotation_field)
3701
3702                            nb_annotation_field += 1
3703
3704                        else:
3705
3706                            if annotation_field not in db_hdr_vcf.get_header().infos:
3707                                log.warning(
3708                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
3709                                )
3710                            if annotation_fields_new_name in self.get_header().infos:
3711                                log.warning(
3712                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
3713                                )
3714
3715                    log.info(
3716                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
3717                    )
3718
3719                    annotation_infos = ",".join(annotation_list)
3720
3721                    if annotation_infos != "":
3722
3723                        # Protect header for bcftools (remove "#CHROM" and variants line)
3724                        log.debug("Protect Header file - remove #CHROM line if exists")
3725                        tmp_header_vcf = NamedTemporaryFile(
3726                            prefix=self.get_prefix(),
3727                            dir=self.get_tmp_dir(),
3728                            suffix=".hdr",
3729                            delete=False,
3730                        )
3731                        tmp_header_vcf_name = tmp_header_vcf.name
3732                        tmp_files.append(tmp_header_vcf_name)
3733                        # Command
3734                        if db_hdr_file.endswith(".gz"):
3735                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
3736                        else:
3737                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
3738                        # Run
3739                        run_parallel_commands([command_extract_header], 1)
3740
3741                        # Find chomosomes
3742                        log.debug("Find chromosomes ")
3743                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
3744                        sql_query_chromosomes_df = self.get_query_to_df(
3745                            sql_query_chromosomes
3746                        )
3747                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])
3748
3749                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))
3750
3751                        # BED columns in the annotation file
3752                        if db_file_type in ["bed"]:
3753                            annotation_infos = "CHROM,POS,POS," + annotation_infos
3754
3755                        for chrom in chomosomes_list:
3756
3757                            # Create BED on initial VCF
3758                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
3759                            tmp_bed = NamedTemporaryFile(
3760                                prefix=self.get_prefix(),
3761                                dir=self.get_tmp_dir(),
3762                                suffix=".bed",
3763                                delete=False,
3764                            )
3765                            tmp_bed_name = tmp_bed.name
3766                            tmp_files.append(tmp_bed_name)
3767
3768                            # Detecte regions
3769                            log.debug(
3770                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
3771                            )
3772                            window = 1000000
3773                            sql_query_intervals_for_bed = f"""
3774                                SELECT  \"#CHROM\",
3775                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
3776                                        \"POS\"+{window}
3777                                FROM {table_variants} as table_variants
3778                                WHERE table_variants.\"#CHROM\" = '{chrom}'
3779                            """
3780                            regions = self.conn.execute(
3781                                sql_query_intervals_for_bed
3782                            ).fetchall()
3783                            merged_regions = merge_regions(regions)
3784                            log.debug(
3785                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
3786                            )
3787
3788                            header = ["#CHROM", "START", "END"]
3789                            with open(tmp_bed_name, "w") as f:
3790                                # Write the header with tab delimiter
3791                                f.write("\t".join(header) + "\n")
3792                                for d in merged_regions:
3793                                    # Write each data row with tab delimiter
3794                                    f.write("\t".join(map(str, d)) + "\n")
3795
3796                            # Tmp files
3797                            tmp_annotation_vcf = NamedTemporaryFile(
3798                                prefix=self.get_prefix(),
3799                                dir=self.get_tmp_dir(),
3800                                suffix=".vcf.gz",
3801                                delete=False,
3802                            )
3803                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
3804                            tmp_files.append(tmp_annotation_vcf_name)
3805                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
3806                            tmp_annotation_vcf_name_err = (
3807                                tmp_annotation_vcf_name + ".err"
3808                            )
3809                            err_files.append(tmp_annotation_vcf_name_err)
3810
3811                            # Annotate Command
3812                            log.debug(
3813                                f"Annotation '{annotation}' - add bcftools command"
3814                            )
3815
3816                            # Command
3817                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
3818
3819                            # Add command
3820                            commands.append(command_annotate)
3821
3822            # if some commands
3823            if commands:
3824
3825                # Export VCF file
3826                self.export_variant_vcf(
3827                    vcf_file=tmp_vcf_name,
3828                    remove_info=True,
3829                    add_samples=False,
3830                    index=True,
3831                )
3832
3833                # Threads
3834                # calculate threads for annotated commands
3835                if commands:
3836                    threads_bcftools_annotate = round(threads / len(commands))
3837                else:
3838                    threads_bcftools_annotate = 1
3839
3840                if not threads_bcftools_annotate:
3841                    threads_bcftools_annotate = 1
3842
3843                # Add threads option to bcftools commands
3844                if threads_bcftools_annotate > 1:
3845                    commands_threaded = []
3846                    for command in commands:
3847                        commands_threaded.append(
3848                            command.replace(
3849                                f"{bcftools_bin_command} annotate ",
3850                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
3851                            )
3852                        )
3853                    commands = commands_threaded
3854
3855                # Command annotation multithreading
3856                log.debug(f"Annotation - Annotation commands: " + str(commands))
3857                log.info(
3858                    f"Annotation - Annotation multithreaded in "
3859                    + str(len(commands))
3860                    + " commands"
3861                )
3862
3863                run_parallel_commands(commands, threads)
3864
3865                # Merge
3866                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)
3867
3868                if tmp_ann_vcf_list_cmd:
3869
3870                    # Tmp file
3871                    tmp_annotate_vcf = NamedTemporaryFile(
3872                        prefix=self.get_prefix(),
3873                        dir=self.get_tmp_dir(),
3874                        suffix=".vcf.gz",
3875                        delete=True,
3876                    )
3877                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
3878                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
3879                    err_files.append(tmp_annotate_vcf_name_err)
3880
3881                    # Tmp file remove command
3882                    tmp_files_remove_command = ""
3883                    if tmp_files:
3884                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)
3885
3886                    # Command merge
3887                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
3888                    log.info(
3889                        f"Annotation - Annotation merging "
3890                        + str(len(commands))
3891                        + " annotated files"
3892                    )
3893                    log.debug(f"Annotation - merge command: {merge_command}")
3894                    run_parallel_commands([merge_command], 1)
3895
3896                    # Error messages
3897                    log.info(f"Error/Warning messages:")
3898                    error_message_command_all = []
3899                    error_message_command_warning = []
3900                    error_message_command_err = []
3901                    for err_file in err_files:
3902                        with open(err_file, "r") as f:
3903                            for line in f:
3904                                message = line.strip()
3905                                error_message_command_all.append(message)
3906                                if line.startswith("[W::"):
3907                                    error_message_command_warning.append(message)
3908                                if line.startswith("[E::"):
3909                                    error_message_command_err.append(
3910                                        f"{err_file}: " + message
3911                                    )
3912                    # log info
3913                    for message in list(
3914                        set(error_message_command_err + error_message_command_warning)
3915                    ):
3916                        log.info(f"   {message}")
3917                    # debug info
3918                    for message in list(set(error_message_command_all)):
3919                        log.debug(f"   {message}")
3920                    # failed
3921                    if len(error_message_command_err):
3922                        log.error("Annotation failed: Error in commands")
3923                        raise ValueError("Annotation failed: Error in commands")
3924
3925                    # Update variants
3926                    log.info(f"Annotation - Updating...")
3927                    self.update_from_vcf(tmp_annotate_vcf_name)
3928
3929    def annotation_exomiser(self, threads: int = None) -> None:
3930        """
3931        This function annotate with Exomiser
3932
3933        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
3934        - "analysis" (dict/file):
3935            Full analysis dictionnary parameters (see Exomiser docs).
3936            Either a dict, or a file in JSON or YAML format.
3937            These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO)
3938            Default : None
3939        - "preset" (string):
3940            Analysis preset (available in config folder).
3941            Used if no full "analysis" is provided.
3942            Default: "exome"
3943        - "phenopacket" (dict/file):
3944            Samples and phenotipic features parameters (see Exomiser docs).
3945            Either a dict, or a file in JSON or YAML format.
3946            Default: None
3947        - "subject" (dict):
3948            Sample parameters (see Exomiser docs).
3949            Example:
3950                "subject":
3951                    {
3952                        "id": "ISDBM322017",
3953                        "sex": "FEMALE"
3954                    }
3955            Default: None
3956        - "sample" (string):
3957            Sample name to construct "subject" section:
3958                "subject":
3959                    {
3960                        "id": "<sample>",
3961                        "sex": "UNKNOWN_SEX"
3962                    }
3963            Default: None
3964        - "phenotypicFeatures" (dict)
3965            Phenotypic features to construct "subject" section.
3966            Example:
3967                "phenotypicFeatures":
3968                    [
3969                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
3970                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
3971                    ]
3972        - "hpo" (list)
3973            List of HPO ids as phenotypic features.
3974            Example:
3975                "hpo": ['0001156', '0001363', '0011304', '0010055']
3976            Default: []
3977        - "outputOptions" (dict):
3978            Output options (see Exomiser docs).
3979            Default:
3980                "output_options" =
3981                    {
3982                        "outputContributingVariantsOnly": False,
3983                        "numGenes": 0,
3984                        "outputFormats": ["TSV_VARIANT", "VCF"]
3985                    }
3986        - "transcript_source" (string):
3987            Transcript source (either "refseq", "ucsc", "ensembl")
3988            Default: "refseq"
3989        - "exomiser_to_info" (boolean):
3990            Add exomiser TSV file columns as INFO fields in VCF.
3991            Default: False
3992        - "release" (string):
3993            Exomise database release.
3994            If not exists, database release will be downloaded (take a while).
3995            Default: None (provided by application.properties configuration file)
3996        - "exomiser_application_properties" (file):
3997            Exomiser configuration file (see Exomiser docs).
3998            Useful to automatically download databases (especially for specific genome databases).
3999
4000        Notes:
4001        - If no sample in parameters, first sample in VCF will be chosen
4002        - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off
4003
4004        :param threads: The number of threads to use
4005        :return: None.
4006        """
4007
4008        # DEBUG
4009        log.debug("Start annotation with Exomiser databases")
4010
4011        # Threads
4012        if not threads:
4013            threads = self.get_threads()
4014        log.debug("Threads: " + str(threads))
4015
4016        # Config
4017        config = self.get_config()
4018        log.debug("Config: " + str(config))
4019
4020        # Config - Folders - Databases
4021        databases_folders = (
4022            config.get("folders", {})
4023            .get("databases", {})
4024            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
4025        )
4026        databases_folders = full_path(databases_folders)
4027        if not os.path.exists(databases_folders):
4028            log.error(f"Databases annotations: {databases_folders} NOT found")
4029        log.debug("Databases annotations: " + str(databases_folders))
4030
4031        # Config - Exomiser
4032        exomiser_bin_command = get_bin_command(
4033            bin="exomiser-cli*.jar",
4034            tool="exomiser",
4035            bin_type="jar",
4036            config=config,
4037            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
4038        )
4039        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
4040        if not exomiser_bin_command:
4041            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
4042            log.error(msg_err)
4043            raise ValueError(msg_err)
4044
4045        # Param
4046        param = self.get_param()
4047        log.debug("Param: " + str(param))
4048
4049        # Param - Exomiser
4050        param_exomiser = param.get("annotation", {}).get("exomiser", {})
4051        log.debug(f"Param Exomiser: {param_exomiser}")
4052
4053        # Param - Assembly
4054        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
4055        log.debug("Assembly: " + str(assembly))
4056
4057        # Data
4058        table_variants = self.get_table_variants()
4059
4060        # Check if not empty
4061        log.debug("Check if not empty")
4062        sql_query_chromosomes = (
4063            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
4064        )
4065        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
4066            log.info(f"VCF empty")
4067            return False
4068
4069        # VCF header
4070        vcf_reader = self.get_header()
4071        log.debug("Initial header: " + str(vcf_reader.infos))
4072
4073        # Samples
4074        samples = self.get_header_sample_list()
4075        if not samples:
4076            log.error("No Samples in VCF")
4077            return False
4078        log.debug(f"Samples: {samples}")
4079
4080        # Memory limit
4081        memory_limit = self.get_memory("8G")
4082        log.debug(f"memory_limit: {memory_limit}")
4083
4084        # Exomiser java options
4085        exomiser_java_options = (
4086            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
4087        )
4088        log.debug(f"Exomiser java options: {exomiser_java_options}")
4089
4090        # Download Exomiser (if not exists)
4091        exomiser_release = param_exomiser.get("release", None)
4092        exomiser_application_properties = param_exomiser.get(
4093            "exomiser_application_properties", None
4094        )
4095        databases_download_exomiser(
4096            assemblies=[assembly],
4097            exomiser_folder=databases_folders,
4098            exomiser_release=exomiser_release,
4099            exomiser_phenotype_release=exomiser_release,
4100            exomiser_application_properties=exomiser_application_properties,
4101        )
4102
4103        # Force annotation
4104        force_update_annotation = True
4105
4106        if "Exomiser" not in self.get_header().infos or force_update_annotation:
4107            log.debug("Start annotation Exomiser")
4108
4109            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
4110
4111                # tmp_dir = "/tmp/exomiser"
4112
4113                ### ANALYSIS ###
4114                ################
4115
4116                # Create analysis.json through analysis dict
4117                # either analysis in param or by default
4118                # depending on preset exome/genome)
4119
4120                # Init analysis dict
4121                param_exomiser_analysis_dict = {}
4122
4123                # analysis from param
4124                param_exomiser_analysis = param_exomiser.get("analysis", {})
4125                param_exomiser_analysis = full_path(param_exomiser_analysis)
4126
4127                # If analysis in param -> load anlaysis json
4128                if param_exomiser_analysis:
4129
4130                    # If param analysis is a file and exists
4131                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
4132                        param_exomiser_analysis
4133                    ):
4134                        # Load analysis file into analysis dict (either yaml or json)
4135                        with open(param_exomiser_analysis) as json_file:
4136                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
4137
4138                    # If param analysis is a dict
4139                    elif isinstance(param_exomiser_analysis, dict):
4140                        # Load analysis dict into analysis dict (either yaml or json)
4141                        param_exomiser_analysis_dict = param_exomiser_analysis
4142
4143                    # Error analysis type
4144                    else:
4145                        log.error(f"Analysis type unknown. Check param file.")
4146                        raise ValueError(f"Analysis type unknown. Check param file.")
4147
4148                # Case no input analysis config file/dict
4149                # Use preset (exome/genome) to open default config file
4150                if not param_exomiser_analysis_dict:
4151
4152                    # default preset
4153                    default_preset = "exome"
4154
4155                    # Get param preset or default preset
4156                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
4157
4158                    # Try to find if preset is a file
4159                    if os.path.exists(param_exomiser_preset):
4160                        # Preset file is provided in full path
4161                        param_exomiser_analysis_default_config_file = (
4162                            param_exomiser_preset
4163                        )
4164                    # elif os.path.exists(full_path(param_exomiser_preset)):
4165                    #     # Preset file is provided in full path
4166                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
4167                    elif os.path.exists(
4168                        os.path.join(folder_config, param_exomiser_preset)
4169                    ):
4170                        # Preset file is provided a basename in config folder (can be a path with subfolders)
4171                        param_exomiser_analysis_default_config_file = os.path.join(
4172                            folder_config, param_exomiser_preset
4173                        )
4174                    else:
4175                        # Construct preset file
4176                        param_exomiser_analysis_default_config_file = os.path.join(
4177                            folder_config,
4178                            f"preset-{param_exomiser_preset}-analysis.json",
4179                        )
4180
4181                    # If preset file exists
4182                    param_exomiser_analysis_default_config_file = full_path(
4183                        param_exomiser_analysis_default_config_file
4184                    )
4185                    if os.path.exists(param_exomiser_analysis_default_config_file):
4186                        # Load prest file into analysis dict (either yaml or json)
4187                        with open(
4188                            param_exomiser_analysis_default_config_file
4189                        ) as json_file:
4190                            # param_exomiser_analysis_dict[""] = json.load(json_file)
4191                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
4192                                json_file
4193                            )
4194
4195                    # Error preset file
4196                    else:
4197                        log.error(
4198                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4199                        )
4200                        raise ValueError(
4201                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4202                        )
4203
4204                # If no analysis dict created
4205                if not param_exomiser_analysis_dict:
4206                    log.error(f"No analysis config")
4207                    raise ValueError(f"No analysis config")
4208
4209                # Log
4210                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4211
4212                ### PHENOPACKET ###
4213                ###################
4214
4215                # If no PhenoPacket in analysis dict -> check in param
4216                if "phenopacket" not in param_exomiser_analysis_dict:
4217
4218                    # If PhenoPacket in param -> load anlaysis json
4219                    if param_exomiser.get("phenopacket", None):
4220
4221                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
4222                        param_exomiser_phenopacket = full_path(
4223                            param_exomiser_phenopacket
4224                        )
4225
4226                        # If param phenopacket is a file and exists
4227                        if isinstance(
4228                            param_exomiser_phenopacket, str
4229                        ) and os.path.exists(param_exomiser_phenopacket):
4230                            # Load phenopacket file into analysis dict (either yaml or json)
4231                            with open(param_exomiser_phenopacket) as json_file:
4232                                param_exomiser_analysis_dict["phenopacket"] = (
4233                                    yaml.safe_load(json_file)
4234                                )
4235
4236                        # If param phenopacket is a dict
4237                        elif isinstance(param_exomiser_phenopacket, dict):
4238                            # Load phenopacket dict into analysis dict (either yaml or json)
4239                            param_exomiser_analysis_dict["phenopacket"] = (
4240                                param_exomiser_phenopacket
4241                            )
4242
4243                        # Error phenopacket type
4244                        else:
4245                            log.error(f"Phenopacket type unknown. Check param file.")
4246                            raise ValueError(
4247                                f"Phenopacket type unknown. Check param file."
4248                            )
4249
4250                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
4251                if "phenopacket" not in param_exomiser_analysis_dict:
4252
4253                    # Init PhenoPacket
4254                    param_exomiser_analysis_dict["phenopacket"] = {
4255                        "id": "analysis",
4256                        "proband": {},
4257                    }
4258
4259                    ### Add subject ###
4260
4261                    # If subject exists
4262                    param_exomiser_subject = param_exomiser.get("subject", {})
4263
4264                    # If subject not exists -> found sample ID
4265                    if not param_exomiser_subject:
4266
4267                        # Found sample ID in param
4268                        sample = param_exomiser.get("sample", None)
4269
4270                        # Find sample ID (first sample)
4271                        if not sample:
4272                            sample_list = self.get_header_sample_list()
4273                            if len(sample_list) > 0:
4274                                sample = sample_list[0]
4275                            else:
4276                                log.error(f"No sample found")
4277                                raise ValueError(f"No sample found")
4278
4279                        # Create subject
4280                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
4281
4282                    # Add to dict
4283                    param_exomiser_analysis_dict["phenopacket"][
4284                        "subject"
4285                    ] = param_exomiser_subject
4286
4287                    ### Add "phenotypicFeatures" ###
4288
4289                    # If phenotypicFeatures exists
4290                    param_exomiser_phenotypicfeatures = param_exomiser.get(
4291                        "phenotypicFeatures", []
4292                    )
4293
4294                    # If phenotypicFeatures not exists -> Try to infer from hpo list
4295                    if not param_exomiser_phenotypicfeatures:
4296
4297                        # Found HPO in param
4298                        param_exomiser_hpo = param_exomiser.get("hpo", [])
4299
4300                        # Split HPO if list in string format separated by comma
4301                        if isinstance(param_exomiser_hpo, str):
4302                            param_exomiser_hpo = param_exomiser_hpo.split(",")
4303
4304                        # Create HPO list
4305                        for hpo in param_exomiser_hpo:
4306                            hpo_clean = re.sub("[^0-9]", "", hpo)
4307                            param_exomiser_phenotypicfeatures.append(
4308                                {
4309                                    "type": {
4310                                        "id": f"HP:{hpo_clean}",
4311                                        "label": f"HP:{hpo_clean}",
4312                                    }
4313                                }
4314                            )
4315
4316                    # Add to dict
4317                    param_exomiser_analysis_dict["phenopacket"][
4318                        "phenotypicFeatures"
4319                    ] = param_exomiser_phenotypicfeatures
4320
4321                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
4322                    if not param_exomiser_phenotypicfeatures:
4323                        for step in param_exomiser_analysis_dict.get(
4324                            "analysis", {}
4325                        ).get("steps", []):
4326                            if "hiPhivePrioritiser" in step:
4327                                param_exomiser_analysis_dict.get("analysis", {}).get(
4328                                    "steps", []
4329                                ).remove(step)
4330
4331                ### Add Input File ###
4332
4333                # Initial file name and htsFiles
4334                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
4335                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
4336                    {
4337                        "uri": tmp_vcf_name,
4338                        "htsFormat": "VCF",
4339                        "genomeAssembly": assembly,
4340                    }
4341                ]
4342
4343                ### Add metaData ###
4344
4345                # If metaData not in analysis dict
4346                if "metaData" not in param_exomiser_analysis_dict:
4347                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
4348                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
4349                        "createdBy": "howard",
4350                        "phenopacketSchemaVersion": 1,
4351                    }
4352
4353                ### OutputOptions ###
4354
4355                # Init output result folder
4356                output_results = os.path.join(tmp_dir, "results")
4357
4358                # If no outputOptions in analysis dict
4359                if "outputOptions" not in param_exomiser_analysis_dict:
4360
4361                    # default output formats
4362                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
4363
4364                    # Get outputOptions in param
4365                    output_options = param_exomiser.get("outputOptions", None)
4366
4367                    # If no output_options in param -> check
4368                    if not output_options:
4369                        output_options = {
4370                            "outputContributingVariantsOnly": False,
4371                            "numGenes": 0,
4372                            "outputFormats": defaut_output_formats,
4373                        }
4374
4375                    # Replace outputDirectory in output options
4376                    output_options["outputDirectory"] = output_results
4377                    output_options["outputFileName"] = "howard"
4378
4379                    # Add outputOptions in analysis dict
4380                    param_exomiser_analysis_dict["outputOptions"] = output_options
4381
4382                else:
4383
4384                    # Replace output_results and output format (if exists in param)
4385                    param_exomiser_analysis_dict["outputOptions"][
4386                        "outputDirectory"
4387                    ] = output_results
4388                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
4389                        list(
4390                            set(
4391                                param_exomiser_analysis_dict.get(
4392                                    "outputOptions", {}
4393                                ).get("outputFormats", [])
4394                                + ["TSV_VARIANT", "VCF"]
4395                            )
4396                        )
4397                    )
4398
4399                # log
4400                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4401
4402                ### ANALYSIS FILE ###
4403                #####################
4404
4405                ### Full JSON analysis config file ###
4406
4407                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
4408                with open(exomiser_analysis, "w") as fp:
4409                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
4410
4411                ### SPLIT analysis and sample config files
4412
4413                # Splitted analysis dict
4414                param_exomiser_analysis_dict_for_split = (
4415                    param_exomiser_analysis_dict.copy()
4416                )
4417
4418                # Phenopacket JSON file
4419                exomiser_analysis_phenopacket = os.path.join(
4420                    tmp_dir, "analysis_phenopacket.json"
4421                )
4422                with open(exomiser_analysis_phenopacket, "w") as fp:
4423                    json.dump(
4424                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
4425                        fp,
4426                        indent=4,
4427                    )
4428
4429                # Analysis JSON file without Phenopacket parameters
4430                param_exomiser_analysis_dict_for_split.pop("phenopacket")
4431                exomiser_analysis_analysis = os.path.join(
4432                    tmp_dir, "analysis_analysis.json"
4433                )
4434                with open(exomiser_analysis_analysis, "w") as fp:
4435                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
4436
4437                ### INITAL VCF file ###
4438                #######################
4439
4440                ### Create list of samples to use and include inti initial VCF file ####
4441
4442                # Subject (main sample)
4443                # Get sample ID in analysis dict
4444                sample_subject = (
4445                    param_exomiser_analysis_dict.get("phenopacket", {})
4446                    .get("subject", {})
4447                    .get("id", None)
4448                )
4449                sample_proband = (
4450                    param_exomiser_analysis_dict.get("phenopacket", {})
4451                    .get("proband", {})
4452                    .get("subject", {})
4453                    .get("id", None)
4454                )
4455                sample = []
4456                if sample_subject:
4457                    sample.append(sample_subject)
4458                if sample_proband:
4459                    sample.append(sample_proband)
4460
4461                # Get sample ID within Pedigree
4462                pedigree_persons_list = (
4463                    param_exomiser_analysis_dict.get("phenopacket", {})
4464                    .get("pedigree", {})
4465                    .get("persons", {})
4466                )
4467
4468                # Create list with all sample ID in pedigree (if exists)
4469                pedigree_persons = []
4470                for person in pedigree_persons_list:
4471                    pedigree_persons.append(person.get("individualId"))
4472
4473                # Concat subject sample ID and samples ID in pedigreesamples
4474                samples = list(set(sample + pedigree_persons))
4475
4476                # Check if sample list is not empty
4477                if not samples:
4478                    log.error(f"No samples found")
4479                    raise ValueError(f"No samples found")
4480
4481                # Create VCF with sample (either sample in param or first one by default)
4482                # Export VCF file
4483                self.export_variant_vcf(
4484                    vcf_file=tmp_vcf_name,
4485                    remove_info=True,
4486                    add_samples=True,
4487                    list_samples=samples,
4488                    index=False,
4489                )
4490
4491                ### Execute Exomiser ###
4492                ########################
4493
4494                # Init command
4495                exomiser_command = ""
4496
4497                # Command exomiser options
4498                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
4499
4500                # Release
4501                exomiser_release = param_exomiser.get("release", None)
4502                if exomiser_release:
4503                    # phenotype data version
4504                    exomiser_options += (
4505                        f" --exomiser.phenotype.data-version={exomiser_release} "
4506                    )
4507                    # data version
4508                    exomiser_options += (
4509                        f" --exomiser.{assembly}.data-version={exomiser_release} "
4510                    )
4511                    # variant white list
4512                    variant_white_list_file = (
4513                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
4514                    )
4515                    if os.path.exists(
4516                        os.path.join(
4517                            databases_folders, assembly, variant_white_list_file
4518                        )
4519                    ):
4520                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
4521
4522                # transcript_source
4523                transcript_source = param_exomiser.get(
4524                    "transcript_source", None
4525                )  # ucsc, refseq, ensembl
4526                if transcript_source:
4527                    exomiser_options += (
4528                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
4529                    )
4530
4531                # If analysis contain proband param
4532                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
4533                    "proband", {}
4534                ):
4535                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
4536
4537                # If no proband (usually uniq sample)
4538                else:
4539                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
4540
4541                # Log
4542                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
4543
4544                # Run command
4545                result = subprocess.call(
4546                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
4547                )
4548                if result:
4549                    log.error("Exomiser command failed")
4550                    raise ValueError("Exomiser command failed")
4551
4552                ### RESULTS ###
4553                ###############
4554
4555                ### Annotate with TSV fields ###
4556
4557                # Init result tsv file
4558                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
4559
4560                # Init result tsv file
4561                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
4562
4563                # Parse TSV file and explode columns in INFO field
4564                if exomiser_to_info and os.path.exists(output_results_tsv):
4565
4566                    # Log
4567                    log.debug("Exomiser columns to VCF INFO field")
4568
4569                    # Retrieve columns and types
4570                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
4571                    output_results_tsv_df = self.get_query_to_df(query)
4572                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
4573
4574                    # Init concat fields for update
4575                    sql_query_update_concat_fields = []
4576
4577                    # Fields to avoid
4578                    fields_to_avoid = [
4579                        "CONTIG",
4580                        "START",
4581                        "END",
4582                        "REF",
4583                        "ALT",
4584                        "QUAL",
4585                        "FILTER",
4586                        "GENOTYPE",
4587                    ]
4588
4589                    # List all columns to add into header
4590                    for header_column in output_results_tsv_columns:
4591
4592                        # If header column is enable
4593                        if header_column not in fields_to_avoid:
4594
4595                            # Header info type
4596                            header_info_type = "String"
4597                            header_column_df = output_results_tsv_df[header_column]
4598                            header_column_df_dtype = header_column_df.dtype
4599                            if header_column_df_dtype == object:
4600                                if (
4601                                    pd.to_numeric(header_column_df, errors="coerce")
4602                                    .notnull()
4603                                    .all()
4604                                ):
4605                                    header_info_type = "Float"
4606                            else:
4607                                header_info_type = "Integer"
4608
4609                            # Header info
4610                            characters_to_validate = ["-"]
4611                            pattern = "[" + "".join(characters_to_validate) + "]"
4612                            header_info_name = re.sub(
4613                                pattern,
4614                                "_",
4615                                f"Exomiser_{header_column}".replace("#", ""),
4616                            )
4617                            header_info_number = "."
4618                            header_info_description = (
4619                                f"Exomiser {header_column} annotation"
4620                            )
4621                            header_info_source = "Exomiser"
4622                            header_info_version = "unknown"
4623                            header_info_code = CODE_TYPE_MAP[header_info_type]
4624                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
4625                                header_info_name,
4626                                header_info_number,
4627                                header_info_type,
4628                                header_info_description,
4629                                header_info_source,
4630                                header_info_version,
4631                                header_info_code,
4632                            )
4633
4634                            # Add field to add for update to concat fields
4635                            sql_query_update_concat_fields.append(
4636                                f"""
4637                                CASE
4638                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
4639                                    THEN concat(
4640                                        '{header_info_name}=',
4641                                        table_parquet."{header_column}",
4642                                        ';'
4643                                        )
4644
4645                                    ELSE ''
4646                                END
4647                            """
4648                            )
4649
4650                    # Update query
4651                    sql_query_update = f"""
4652                        UPDATE {table_variants} as table_variants
4653                            SET INFO = concat(
4654                                            CASE
4655                                                WHEN INFO NOT IN ('', '.')
4656                                                THEN INFO
4657                                                ELSE ''
4658                                            END,
4659                                            CASE
4660                                                WHEN table_variants.INFO NOT IN ('','.')
4661                                                THEN ';'
4662                                                ELSE ''
4663                                            END,
4664                                            (
4665                                            SELECT 
4666                                                concat(
4667                                                    {",".join(sql_query_update_concat_fields)}
4668                                                )
4669                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
4670                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
4671                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
4672                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
4673                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
4674                                            )
4675                                        )
4676                            ;
4677                        """
4678
4679                    # Update
4680                    self.conn.execute(sql_query_update)
4681
4682                ### Annotate with VCF INFO field ###
4683
4684                # Init result VCF file
4685                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
4686
4687                # If VCF exists
4688                if os.path.exists(output_results_vcf):
4689
4690                    # Log
4691                    log.debug("Exomiser result VCF update variants")
4692
4693                    # Find Exomiser INFO field annotation in header
4694                    with gzip.open(output_results_vcf, "rt") as f:
4695                        header_list = self.read_vcf_header(f)
4696                    exomiser_vcf_header = vcf.Reader(
4697                        io.StringIO("\n".join(header_list))
4698                    )
4699
4700                    # Add annotation INFO field to header
4701                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
4702
4703                    # Update variants with VCF
4704                    self.update_from_vcf(output_results_vcf)
4705
4706        return True
4707
4708    def annotation_snpeff(self, threads: int = None) -> None:
4709        """
4710        This function annotate with snpEff
4711
4712        :param threads: The number of threads to use
4713        :return: the value of the variable "return_value".
4714        """
4715
4716        # DEBUG
4717        log.debug("Start annotation with snpeff databases")
4718
4719        # Threads
4720        if not threads:
4721            threads = self.get_threads()
4722        log.debug("Threads: " + str(threads))
4723
4724        # DEBUG
4725        delete_tmp = True
4726        if self.get_config().get("verbosity", "warning") in ["debug"]:
4727            delete_tmp = False
4728            log.debug("Delete tmp files/folders: " + str(delete_tmp))
4729
4730        # Config
4731        config = self.get_config()
4732        log.debug("Config: " + str(config))
4733
4734        # Config - Folders - Databases
4735        databases_folders = (
4736            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
4737        )
4738        log.debug("Databases annotations: " + str(databases_folders))
4739
4740        # # Config - Java
4741        # java_bin = get_bin(
4742        #     tool="java",
4743        #     bin="java",
4744        #     bin_type="bin",
4745        #     config=config,
4746        #     default_folder="/usr/bin",
4747        # )
4748        # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))):
4749        #     log.error(f"Annotation failed: no java bin '{java_bin}'")
4750        #     raise ValueError(f"Annotation failed: no java bin '{java_bin}'")
4751
4752        # # Config - snpEff bin
4753        # snpeff_jar = get_bin(
4754        #     tool="snpeff",
4755        #     bin="snpEff.jar",
4756        #     bin_type="jar",
4757        #     config=config,
4758        #     default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
4759        # )
4760        # if not (os.path.exists(snpeff_jar) or (snpeff_jar and which(snpeff_jar))):
4761        #     log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
4762        #     raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
4763
4764        # Config - snpEff bin command
4765        snpeff_bin_command = get_bin_command(
4766            bin="snpEff.jar",
4767            tool="snpeff",
4768            bin_type="jar",
4769            config=config,
4770            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
4771        )
4772        if not snpeff_bin_command:
4773            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
4774            log.error(msg_err)
4775            raise ValueError(msg_err)
4776
4777        # Config - snpEff databases
4778        snpeff_databases = (
4779            config.get("folders", {})
4780            .get("databases", {})
4781            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
4782        )
4783        snpeff_databases = full_path(snpeff_databases)
4784        if snpeff_databases is not None and snpeff_databases != "":
4785            log.debug(f"Create snpEff databases folder")
4786            if not os.path.exists(snpeff_databases):
4787                os.makedirs(snpeff_databases)
4788
4789        # Param
4790        param = self.get_param()
4791        log.debug("Param: " + str(param))
4792
4793        # Param
4794        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
4795        log.debug("Options: " + str(options))
4796
4797        # Param - Assembly
4798        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
4799
4800        # Param - Options
4801        snpeff_options = (
4802            param.get("annotation", {}).get("snpeff", {}).get("options", "")
4803        )
4804        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
4805        snpeff_csvstats = (
4806            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
4807        )
4808        if snpeff_stats:
4809            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
4810            snpeff_stats = full_path(snpeff_stats)
4811            snpeff_options += f" -stats {snpeff_stats}"
4812        if snpeff_csvstats:
4813            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
4814            snpeff_csvstats = full_path(snpeff_csvstats)
4815            snpeff_options += f" -csvStats {snpeff_csvstats}"
4816
4817        # Data
4818        table_variants = self.get_table_variants()
4819
4820        # Check if not empty
4821        log.debug("Check if not empty")
4822        sql_query_chromosomes = (
4823            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
4824        )
4825        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
4826        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
4827            log.info(f"VCF empty")
4828            return
4829
4830        # Export in VCF
4831        log.debug("Create initial file to annotate")
4832        tmp_vcf = NamedTemporaryFile(
4833            prefix=self.get_prefix(),
4834            dir=self.get_tmp_dir(),
4835            suffix=".vcf.gz",
4836            delete=True,
4837        )
4838        tmp_vcf_name = tmp_vcf.name
4839
4840        # VCF header
4841        vcf_reader = self.get_header()
4842        log.debug("Initial header: " + str(vcf_reader.infos))
4843
4844        # Existing annotations
4845        for vcf_annotation in self.get_header().infos:
4846
4847            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
4848            log.debug(
4849                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
4850            )
4851
4852        # Memory limit
4853        # if config.get("memory", None):
4854        #     memory_limit = config.get("memory", "8G")
4855        # else:
4856        #     memory_limit = "8G"
4857        memory_limit = self.get_memory("8G")
4858        log.debug(f"memory_limit: {memory_limit}")
4859
4860        # snpEff java options
4861        snpeff_java_options = (
4862            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
4863        )
4864        log.debug(f"Exomiser java options: {snpeff_java_options}")
4865
4866        force_update_annotation = True
4867
4868        if "ANN" not in self.get_header().infos or force_update_annotation:
4869
4870            # Check snpEff database
4871            log.debug(f"Check snpEff databases {[assembly]}")
4872            databases_download_snpeff(
4873                folder=snpeff_databases, assemblies=[assembly], config=config
4874            )
4875
4876            # Export VCF file
4877            self.export_variant_vcf(
4878                vcf_file=tmp_vcf_name,
4879                remove_info=True,
4880                add_samples=False,
4881                index=True,
4882            )
4883
4884            # Tmp file
4885            err_files = []
4886            tmp_annotate_vcf = NamedTemporaryFile(
4887                prefix=self.get_prefix(),
4888                dir=self.get_tmp_dir(),
4889                suffix=".vcf",
4890                delete=False,
4891            )
4892            tmp_annotate_vcf_name = tmp_annotate_vcf.name
4893            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
4894            err_files.append(tmp_annotate_vcf_name_err)
4895
4896            # Command
4897            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
4898            log.debug(f"Annotation - snpEff command: {snpeff_command}")
4899            run_parallel_commands([snpeff_command], 1)
4900
4901            # Error messages
4902            log.info(f"Error/Warning messages:")
4903            error_message_command_all = []
4904            error_message_command_warning = []
4905            error_message_command_err = []
4906            for err_file in err_files:
4907                with open(err_file, "r") as f:
4908                    for line in f:
4909                        message = line.strip()
4910                        error_message_command_all.append(message)
4911                        if line.startswith("[W::"):
4912                            error_message_command_warning.append(message)
4913                        if line.startswith("[E::"):
4914                            error_message_command_err.append(f"{err_file}: " + message)
4915            # log info
4916            for message in list(
4917                set(error_message_command_err + error_message_command_warning)
4918            ):
4919                log.info(f"   {message}")
4920            # debug info
4921            for message in list(set(error_message_command_all)):
4922                log.debug(f"   {message}")
4923            # failed
4924            if len(error_message_command_err):
4925                log.error("Annotation failed: Error in commands")
4926                raise ValueError("Annotation failed: Error in commands")
4927
4928            # Find annotation in header
4929            with open(tmp_annotate_vcf_name, "rt") as f:
4930                header_list = self.read_vcf_header(f)
4931            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
4932
4933            for ann in annovar_vcf_header.infos:
4934                if ann not in self.get_header().infos:
4935                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
4936
4937            # Update variants
4938            log.info(f"Annotation - Updating...")
4939            self.update_from_vcf(tmp_annotate_vcf_name)
4940
4941        else:
4942            if "ANN" in self.get_header().infos:
4943                log.debug(f"Existing snpEff annotations in VCF")
4944            if force_update_annotation:
4945                log.debug(f"Existing snpEff annotations in VCF - annotation forced")
4946
4947    def annotation_annovar(self, threads: int = None) -> None:
4948        """
4949        It takes a VCF file, annotates it with Annovar, and then updates the database with the new
4950        annotations
4951
4952        :param threads: number of threads to use
4953        :return: the value of the variable "return_value".
4954        """
4955
4956        # DEBUG
4957        log.debug("Start annotation with Annovar databases")
4958
4959        # Threads
4960        if not threads:
4961            threads = self.get_threads()
4962        log.debug("Threads: " + str(threads))
4963
4964        # Tmp en Err files
4965        tmp_files = []
4966        err_files = []
4967
4968        # DEBUG
4969        delete_tmp = True
4970        if self.get_config().get("verbosity", "warning") in ["debug"]:
4971            delete_tmp = False
4972            log.debug("Delete tmp files/folders: " + str(delete_tmp))
4973
4974        # Config
4975        config = self.get_config()
4976        log.debug("Config: " + str(config))
4977
4978        # Config - Folders - Databases
4979        databases_folders = (
4980            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
4981        )
4982        log.debug("Databases annotations: " + str(databases_folders))
4983
4984        # Config - annovar bin command
4985        annovar_bin_command = get_bin_command(
4986            bin="table_annovar.pl",
4987            tool="annovar",
4988            bin_type="perl",
4989            config=config,
4990            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
4991        )
4992        if not annovar_bin_command:
4993            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
4994            log.error(msg_err)
4995            raise ValueError(msg_err)
4996
4997        # Config - BCFTools bin command
4998        bcftools_bin_command = get_bin_command(
4999            bin="bcftools",
5000            tool="bcftools",
5001            bin_type="bin",
5002            config=config,
5003            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
5004        )
5005        if not bcftools_bin_command:
5006            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
5007            log.error(msg_err)
5008            raise ValueError(msg_err)
5009
5010        # Config - annovar databases
5011        annovar_databases = (
5012            config.get("folders", {})
5013            .get("databases", {})
5014            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
5015        )
5016        annovar_databases = full_path(annovar_databases)
5017        if annovar_databases != "" and not os.path.exists(annovar_databases):
5018            os.makedirs(annovar_databases)
5019
5020        # Param
5021        param = self.get_param()
5022        log.debug("Param: " + str(param))
5023
5024        # Param - options
5025        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
5026        log.debug("Options: " + str(options))
5027
5028        # Param - annotations
5029        annotations = (
5030            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
5031        )
5032        log.debug("Annotations: " + str(annotations))
5033
5034        # Param - Assembly
5035        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
5036
5037        # Annovar database assembly
5038        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
5039        if annovar_databases_assembly != "" and not os.path.exists(
5040            annovar_databases_assembly
5041        ):
5042            os.makedirs(annovar_databases_assembly)
5043
5044        # Data
5045        table_variants = self.get_table_variants()
5046
5047        # Check if not empty
5048        log.debug("Check if not empty")
5049        sql_query_chromosomes = (
5050            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
5051        )
5052        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
5053        if not sql_query_chromosomes_df["count"][0]:
5054            log.info(f"VCF empty")
5055            return
5056
5057        # VCF header
5058        vcf_reader = self.get_header()
5059        log.debug("Initial header: " + str(vcf_reader.infos))
5060
5061        # Existing annotations
5062        for vcf_annotation in self.get_header().infos:
5063
5064            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
5065            log.debug(
5066                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
5067            )
5068
5069        force_update_annotation = True
5070
5071        if annotations:
5072
5073            commands = []
5074            tmp_annotates_vcf_name_list = []
5075
5076            # Export in VCF
5077            log.debug("Create initial file to annotate")
5078            tmp_vcf = NamedTemporaryFile(
5079                prefix=self.get_prefix(),
5080                dir=self.get_tmp_dir(),
5081                suffix=".vcf.gz",
5082                delete=False,
5083            )
5084            tmp_vcf_name = tmp_vcf.name
5085            tmp_files.append(tmp_vcf_name)
5086            tmp_files.append(tmp_vcf_name + ".tbi")
5087
5088            # Export VCF file
5089            self.export_variant_vcf(
5090                vcf_file=tmp_vcf_name,
5091                remove_info=".",
5092                add_samples=False,
5093                index=True,
5094            )
5095
5096            # Create file for field rename
5097            log.debug("Create file for field rename")
5098            tmp_rename = NamedTemporaryFile(
5099                prefix=self.get_prefix(),
5100                dir=self.get_tmp_dir(),
5101                suffix=".rename",
5102                delete=False,
5103            )
5104            tmp_rename_name = tmp_rename.name
5105            tmp_files.append(tmp_rename_name)
5106
5107            # Check Annovar database
5108            log.debug(
5109                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
5110            )
5111            databases_download_annovar(
5112                folder=annovar_databases,
5113                files=list(annotations.keys()),
5114                assemblies=[assembly],
5115            )
5116
5117            for annotation in annotations:
5118                annotation_fields = annotations[annotation]
5119
5120                if not annotation_fields:
5121                    annotation_fields = {"INFO": None}
5122
5123                log.info(f"Annotations Annovar - database '{annotation}'")
5124                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")
5125
5126                # Tmp file for annovar
5127                err_files = []
5128                tmp_annotate_vcf_directory = TemporaryDirectory(
5129                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
5130                )
5131                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
5132                tmp_annotate_vcf_name_annovar = (
5133                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
5134                )
5135                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
5136                err_files.append(tmp_annotate_vcf_name_err)
5137                tmp_files.append(tmp_annotate_vcf_name_err)
5138
5139                # Tmp file final vcf annotated by annovar
5140                tmp_annotate_vcf = NamedTemporaryFile(
5141                    prefix=self.get_prefix(),
5142                    dir=self.get_tmp_dir(),
5143                    suffix=".vcf.gz",
5144                    delete=False,
5145                )
5146                tmp_annotate_vcf_name = tmp_annotate_vcf.name
5147                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
5148                tmp_files.append(tmp_annotate_vcf_name)
5149                tmp_files.append(tmp_annotate_vcf_name + ".tbi")
5150
5151                # Number of fields
5152                annotation_list = []
5153                annotation_renamed_list = []
5154
5155                for annotation_field in annotation_fields:
5156
5157                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
5158                    annotation_fields_new_name = annotation_fields.get(
5159                        annotation_field, annotation_field
5160                    )
5161                    if not annotation_fields_new_name:
5162                        annotation_fields_new_name = annotation_field
5163
5164                    if (
5165                        force_update_annotation
5166                        or annotation_fields_new_name not in self.get_header().infos
5167                    ):
5168                        annotation_list.append(annotation_field)
5169                        annotation_renamed_list.append(annotation_fields_new_name)
5170                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
5171                        log.warning(
5172                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
5173                        )
5174
5175                    # Add rename info
5176                    run_parallel_commands(
5177                        [
5178                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
5179                        ],
5180                        1,
5181                    )
5182
5183                # log.debug("fields_to_removed: " + str(fields_to_removed))
5184                log.debug("annotation_list: " + str(annotation_list))
5185
5186                # protocol
5187                protocol = annotation
5188
5189                # argument
5190                argument = ""
5191
5192                # operation
5193                operation = "f"
5194                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
5195                    "ensGene"
5196                ):
5197                    operation = "g"
5198                    if options.get("genebase", None):
5199                        argument = f"""'{options.get("genebase","")}'"""
5200                elif annotation in ["cytoBand"]:
5201                    operation = "r"
5202
5203                # argument option
5204                argument_option = ""
5205                if argument != "":
5206                    argument_option = " --argument " + argument
5207
5208                # command options
5209                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
5210                for option in options:
5211                    if option not in ["genebase"]:
5212                        command_options += f""" --{option}={options[option]}"""
5213
5214                # Command
5215
5216                # Command - Annovar
5217                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
5218                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")
5219
5220                # Command - start pipe
5221                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """
5222
5223                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
5224                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """
5225
5226                # Command - Special characters (refGene annotation)
5227                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """
5228
5229                # Command - Clean empty fields (with value ".")
5230                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """
5231
5232                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
5233                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
5234                if "ALL" not in annotation_list and "INFO" not in annotation_list:
5235                    # for ann in annotation_renamed_list:
5236                    for ann in annotation_list:
5237                        annovar_fields_to_keep.append(f"^INFO/{ann}")
5238
5239                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """
5240
5241                # Command - indexing
5242                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """
5243
5244                log.debug(f"Annotation - Annovar command: {command_annovar}")
5245                run_parallel_commands([command_annovar], 1)
5246
5247                # Error messages
5248                log.info(f"Error/Warning messages:")
5249                error_message_command_all = []
5250                error_message_command_warning = []
5251                error_message_command_err = []
5252                for err_file in err_files:
5253                    with open(err_file, "r") as f:
5254                        for line in f:
5255                            message = line.strip()
5256                            error_message_command_all.append(message)
5257                            if line.startswith("[W::") or line.startswith("WARNING"):
5258                                error_message_command_warning.append(message)
5259                            if line.startswith("[E::") or line.startswith("ERROR"):
5260                                error_message_command_err.append(
5261                                    f"{err_file}: " + message
5262                                )
5263                # log info
5264                for message in list(
5265                    set(error_message_command_err + error_message_command_warning)
5266                ):
5267                    log.info(f"   {message}")
5268                # debug info
5269                for message in list(set(error_message_command_all)):
5270                    log.debug(f"   {message}")
5271                # failed
5272                if len(error_message_command_err):
5273                    log.error("Annotation failed: Error in commands")
5274                    raise ValueError("Annotation failed: Error in commands")
5275
5276            if tmp_annotates_vcf_name_list:
5277
5278                # List of annotated files
5279                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)
5280
5281                # Tmp file
5282                tmp_annotate_vcf = NamedTemporaryFile(
5283                    prefix=self.get_prefix(),
5284                    dir=self.get_tmp_dir(),
5285                    suffix=".vcf.gz",
5286                    delete=False,
5287                )
5288                tmp_annotate_vcf_name = tmp_annotate_vcf.name
5289                tmp_files.append(tmp_annotate_vcf_name)
5290                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
5291                err_files.append(tmp_annotate_vcf_name_err)
5292                tmp_files.append(tmp_annotate_vcf_name_err)
5293
5294                # Command merge
5295                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
5296                log.info(
5297                    f"Annotation Annovar - Annotation merging "
5298                    + str(len(tmp_annotates_vcf_name_list))
5299                    + " annotated files"
5300                )
5301                log.debug(f"Annotation - merge command: {merge_command}")
5302                run_parallel_commands([merge_command], 1)
5303
5304                # Find annotation in header
5305                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
5306                    header_list = self.read_vcf_header(f)
5307                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
5308
5309                for ann in annovar_vcf_header.infos:
5310                    if ann not in self.get_header().infos:
5311                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
5312
5313                # Update variants
5314                log.info(f"Annotation Annovar - Updating...")
5315                self.update_from_vcf(tmp_annotate_vcf_name)
5316
5317            # Clean files
5318            # Tmp file remove command
5319            if True:
5320                tmp_files_remove_command = ""
5321                if tmp_files:
5322                    tmp_files_remove_command = " ".join(tmp_files)
5323                clean_command = f" rm -f {tmp_files_remove_command} "
5324                log.debug(f"Annotation Annovar - Annotation cleaning ")
5325                log.debug(f"Annotation - cleaning command: {clean_command}")
5326                run_parallel_commands([clean_command], 1)
5327
5328    # Parquet
5329    def annotation_parquet(self, threads: int = None) -> None:
5330        """
5331        It takes a VCF file, and annotates it with a parquet file
5332
5333        :param threads: number of threads to use for the annotation
5334        :return: the value of the variable "result".
5335        """
5336
5337        # DEBUG
5338        log.debug("Start annotation with parquet databases")
5339
5340        # Threads
5341        if not threads:
5342            threads = self.get_threads()
5343        log.debug("Threads: " + str(threads))
5344
5345        # DEBUG
5346        delete_tmp = True
5347        if self.get_config().get("verbosity", "warning") in ["debug"]:
5348            delete_tmp = False
5349            log.debug("Delete tmp files/folders: " + str(delete_tmp))
5350
5351        # Config
5352        databases_folders = set(
5353            self.get_config()
5354            .get("folders", {})
5355            .get("databases", {})
5356            .get("annotations", ["."])
5357            + self.get_config()
5358            .get("folders", {})
5359            .get("databases", {})
5360            .get("parquet", ["."])
5361        )
5362        log.debug("Databases annotations: " + str(databases_folders))
5363
5364        # Param
5365        annotations = (
5366            self.get_param()
5367            .get("annotation", {})
5368            .get("parquet", {})
5369            .get("annotations", None)
5370        )
5371        log.debug("Annotations: " + str(annotations))
5372
5373        # Assembly
5374        assembly = self.get_param().get(
5375            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
5376        )
5377
5378        # Force Update Annotation
5379        force_update_annotation = (
5380            self.get_param()
5381            .get("annotation", {})
5382            .get("options", {})
5383            .get("annotations_update", False)
5384        )
5385        log.debug(f"force_update_annotation={force_update_annotation}")
5386        force_append_annotation = (
5387            self.get_param()
5388            .get("annotation", {})
5389            .get("options", {})
5390            .get("annotations_append", False)
5391        )
5392        log.debug(f"force_append_annotation={force_append_annotation}")
5393
5394        # Data
5395        table_variants = self.get_table_variants()
5396
5397        # Check if not empty
5398        log.debug("Check if not empty")
5399        sql_query_chromosomes_df = self.get_query_to_df(
5400            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
5401        )
5402        if not sql_query_chromosomes_df["count"][0]:
5403            log.info(f"VCF empty")
5404            return
5405
5406        # VCF header
5407        vcf_reader = self.get_header()
5408        log.debug("Initial header: " + str(vcf_reader.infos))
5409
5410        # Nb Variants POS
5411        log.debug("NB Variants Start")
5412        nb_variants = self.conn.execute(
5413            f"SELECT count(*) AS count FROM variants"
5414        ).fetchdf()["count"][0]
5415        log.debug("NB Variants Stop")
5416
5417        # Existing annotations
5418        for vcf_annotation in self.get_header().infos:
5419
5420            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
5421            log.debug(
5422                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
5423            )
5424
5425        # Added columns
5426        added_columns = []
5427
5428        # drop indexes
5429        log.debug(f"Drop indexes...")
5430        self.drop_indexes()
5431
5432        if annotations:
5433
5434            if "ALL" in annotations:
5435
5436                all_param = annotations.get("ALL", {})
5437                all_param_formats = all_param.get("formats", None)
5438                all_param_releases = all_param.get("releases", None)
5439
5440                databases_infos_dict = self.scan_databases(
5441                    database_formats=all_param_formats,
5442                    database_releases=all_param_releases,
5443                )
5444                for database_infos in databases_infos_dict.keys():
5445                    if database_infos not in annotations:
5446                        annotations[database_infos] = {"INFO": None}
5447
5448            for annotation in annotations:
5449
5450                if annotation in ["ALL"]:
5451                    continue
5452
5453                # Annotation Name
5454                annotation_name = os.path.basename(annotation)
5455
5456                # Annotation fields
5457                annotation_fields = annotations[annotation]
5458                if not annotation_fields:
5459                    annotation_fields = {"INFO": None}
5460
5461                log.debug(f"Annotation '{annotation_name}'")
5462                log.debug(
5463                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
5464                )
5465
5466                # Create Database
5467                database = Database(
5468                    database=annotation,
5469                    databases_folders=databases_folders,
5470                    assembly=assembly,
5471                )
5472
5473                # Find files
5474                parquet_file = database.get_database()
5475                parquet_hdr_file = database.get_header_file()
5476                parquet_type = database.get_type()
5477
5478                # Check if files exists
5479                if not parquet_file or not parquet_hdr_file:
5480                    log.error("Annotation failed: file not found")
5481                    raise ValueError("Annotation failed: file not found")
5482                else:
5483                    # Get parquet connexion
5484                    parquet_sql_attach = database.get_sql_database_attach(
5485                        output="query"
5486                    )
5487                    if parquet_sql_attach:
5488                        self.conn.execute(parquet_sql_attach)
5489                    parquet_file_link = database.get_sql_database_link()
5490                    # Log
5491                    log.debug(
5492                        f"Annotation '{annotation_name}' - file: "
5493                        + str(parquet_file)
5494                        + " and "
5495                        + str(parquet_hdr_file)
5496                    )
5497
5498                    # Database full header columns
5499                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
5500                        parquet_hdr_file
5501                    )
5502                    # Log
5503                    log.debug(
5504                        "Annotation database header columns : "
5505                        + str(parquet_hdr_vcf_header_columns)
5506                    )
5507
5508                    # Load header as VCF object
5509                    parquet_hdr_vcf_header_infos = database.get_header().infos
5510                    # Log
5511                    log.debug(
5512                        "Annotation database header: "
5513                        + str(parquet_hdr_vcf_header_infos)
5514                    )
5515
5516                    # Get extra infos
5517                    parquet_columns = database.get_extra_columns()
5518                    # Log
5519                    log.debug("Annotation database Columns: " + str(parquet_columns))
5520
5521                    # Add extra columns if "ALL" in annotation_fields
5522                    # if "ALL" in annotation_fields:
5523                    #     allow_add_extra_column = True
5524                    if "ALL" in annotation_fields and database.get_extra_columns():
5525                        for extra_column in database.get_extra_columns():
5526                            if (
5527                                extra_column not in annotation_fields
5528                                and extra_column.replace("INFO/", "")
5529                                not in parquet_hdr_vcf_header_infos
5530                            ):
5531                                parquet_hdr_vcf_header_infos[extra_column] = (
5532                                    vcf.parser._Info(
5533                                        extra_column,
5534                                        ".",
5535                                        "String",
5536                                        f"{extra_column} description",
5537                                        "unknown",
5538                                        "unknown",
5539                                        self.code_type_map["String"],
5540                                    )
5541                                )
5542
5543                    # For all fields in database
5544                    annotation_fields_all = False
5545                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
5546                        annotation_fields_all = True
5547                        annotation_fields = {
5548                            key: key for key in parquet_hdr_vcf_header_infos
5549                        }
5550
5551                        log.debug(
5552                            "Annotation database header - All annotations added: "
5553                            + str(annotation_fields)
5554                        )
5555
5556                    # Init
5557
5558                    # List of annotation fields to use
5559                    sql_query_annotation_update_info_sets = []
5560
5561                    # List of annotation to agregate
5562                    sql_query_annotation_to_agregate = []
5563
5564                    # Number of fields
5565                    nb_annotation_field = 0
5566
5567                    # Annotation fields processed
5568                    annotation_fields_processed = []
5569
5570                    # Columns mapping
5571                    map_columns = database.map_columns(
5572                        columns=annotation_fields, prefixes=["INFO/"]
5573                    )
5574
5575                    # Query dict for fields to remove (update option)
5576                    query_dict_remove = {}
5577
5578                    # Fetch Anotation fields
5579                    for annotation_field in annotation_fields:
5580
5581                        # annotation_field_column
5582                        annotation_field_column = map_columns.get(
5583                            annotation_field, "INFO"
5584                        )
5585
5586                        # field new name, if parametered
5587                        annotation_fields_new_name = annotation_fields.get(
5588                            annotation_field, annotation_field
5589                        )
5590                        if not annotation_fields_new_name:
5591                            annotation_fields_new_name = annotation_field
5592
5593                        # To annotate
5594                        # force_update_annotation = True
5595                        # force_append_annotation = True
5596                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
5597                        if annotation_field in parquet_hdr_vcf_header_infos and (
5598                            force_update_annotation
5599                            or force_append_annotation
5600                            or (
5601                                annotation_fields_new_name
5602                                not in self.get_header().infos
5603                            )
5604                        ):
5605
5606                            # Add field to annotation to process list
5607                            annotation_fields_processed.append(
5608                                annotation_fields_new_name
5609                            )
5610
5611                            # explode infos for the field
5612                            annotation_fields_new_name_info_msg = ""
5613                            if (
5614                                force_update_annotation
5615                                and annotation_fields_new_name
5616                                in self.get_header().infos
5617                            ):
5618                                # Remove field from INFO
5619                                query = f"""
5620                                    UPDATE {table_variants} as table_variants
5621                                    SET INFO = REGEXP_REPLACE(
5622                                                concat(table_variants.INFO,''),
5623                                                ';*{annotation_fields_new_name}=[^;]*',
5624                                                ''
5625                                                )
5626                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
5627                                """
5628                                annotation_fields_new_name_info_msg = " [update]"
5629                                query_dict_remove[
5630                                    f"remove 'INFO/{annotation_fields_new_name}'"
5631                                ] = query
5632
5633                            # Sep between fields in INFO
5634                            nb_annotation_field += 1
5635                            if nb_annotation_field > 1:
5636                                annotation_field_sep = ";"
5637                            else:
5638                                annotation_field_sep = ""
5639
5640                            log.info(
5641                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
5642                            )
5643
5644                            # Add INFO field to header
5645                            parquet_hdr_vcf_header_infos_number = (
5646                                parquet_hdr_vcf_header_infos[annotation_field].num
5647                                or "."
5648                            )
5649                            parquet_hdr_vcf_header_infos_type = (
5650                                parquet_hdr_vcf_header_infos[annotation_field].type
5651                                or "String"
5652                            )
5653                            parquet_hdr_vcf_header_infos_description = (
5654                                parquet_hdr_vcf_header_infos[annotation_field].desc
5655                                or f"{annotation_field} description"
5656                            )
5657                            parquet_hdr_vcf_header_infos_source = (
5658                                parquet_hdr_vcf_header_infos[annotation_field].source
5659                                or "unknown"
5660                            )
5661                            parquet_hdr_vcf_header_infos_version = (
5662                                parquet_hdr_vcf_header_infos[annotation_field].version
5663                                or "unknown"
5664                            )
5665
5666                            vcf_reader.infos[annotation_fields_new_name] = (
5667                                vcf.parser._Info(
5668                                    annotation_fields_new_name,
5669                                    parquet_hdr_vcf_header_infos_number,
5670                                    parquet_hdr_vcf_header_infos_type,
5671                                    parquet_hdr_vcf_header_infos_description,
5672                                    parquet_hdr_vcf_header_infos_source,
5673                                    parquet_hdr_vcf_header_infos_version,
5674                                    self.code_type_map[
5675                                        parquet_hdr_vcf_header_infos_type
5676                                    ],
5677                                )
5678                            )
5679
5680                            # Append
5681                            if force_append_annotation:
5682                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
5683                            else:
5684                                query_case_when_append = ""
5685
5686                            # Annotation/Update query fields
5687                            # Found in INFO column
5688                            if (
5689                                annotation_field_column == "INFO"
5690                                and "INFO" in parquet_hdr_vcf_header_columns
5691                            ):
5692                                sql_query_annotation_update_info_sets.append(
5693                                    f"""
5694                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
5695                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
5696                                        ELSE ''
5697                                    END
5698                                """
5699                                )
5700                            # Found in a specific column
5701                            else:
5702                                sql_query_annotation_update_info_sets.append(
5703                                    f"""
5704                                CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append}
5705                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(table_parquet."{annotation_field_column}", ';', ','))
5706                                        ELSE ''
5707                                    END
5708                                """
5709                                )
5710                                sql_query_annotation_to_agregate.append(
5711                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
5712                                )
5713
5714                        # Not to annotate
5715                        else:
5716
5717                            if force_update_annotation:
5718                                annotation_message = "forced"
5719                            else:
5720                                annotation_message = "skipped"
5721
5722                            if annotation_field not in parquet_hdr_vcf_header_infos:
5723                                log.warning(
5724                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
5725                                )
5726                            if annotation_fields_new_name in self.get_header().infos:
5727                                log.warning(
5728                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
5729                                )
5730
5731                    # Check if ALL fields have to be annotated. Thus concat all INFO field
5732                    # allow_annotation_full_info = True
5733                    allow_annotation_full_info = not force_append_annotation
5734
5735                    if parquet_type in ["regions"]:
5736                        allow_annotation_full_info = False
5737
5738                    if (
5739                        allow_annotation_full_info
5740                        and nb_annotation_field == len(annotation_fields)
5741                        and annotation_fields_all
5742                        and (
5743                            "INFO" in parquet_hdr_vcf_header_columns
5744                            and "INFO" in database.get_extra_columns()
5745                        )
5746                    ):
5747                        log.debug("Column INFO annotation enabled")
5748                        sql_query_annotation_update_info_sets = []
5749                        sql_query_annotation_update_info_sets.append(
5750                            f" table_parquet.INFO "
5751                        )
5752
5753                    if sql_query_annotation_update_info_sets:
5754
5755                        # Annotate
5756                        log.info(f"Annotation '{annotation_name}' - Annotation...")
5757
5758                        # Join query annotation update info sets for SQL
5759                        sql_query_annotation_update_info_sets_sql = ",".join(
5760                            sql_query_annotation_update_info_sets
5761                        )
5762
5763                        # Check chromosomes list (and variants infos)
5764                        sql_query_chromosomes = f"""
5765                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
5766                            FROM {table_variants} as table_variants
5767                            GROUP BY table_variants."#CHROM"
5768                            ORDER BY table_variants."#CHROM"
5769                            """
5770                        sql_query_chromosomes_df = self.conn.execute(
5771                            sql_query_chromosomes
5772                        ).df()
5773                        sql_query_chromosomes_dict = {
5774                            entry["CHROM"]: {
5775                                "count": entry["count_variants"],
5776                                "min": entry["min_variants"],
5777                                "max": entry["max_variants"],
5778                            }
5779                            for index, entry in sql_query_chromosomes_df.iterrows()
5780                        }
5781
5782                        # Init
5783                        nb_of_query = 0
5784                        nb_of_variant_annotated = 0
5785                        query_dict = query_dict_remove
5786
5787                        # for chrom in sql_query_chromosomes_df["CHROM"]:
5788                        for chrom in sql_query_chromosomes_dict:
5789
5790                            # Number of variant by chromosome
5791                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
5792                                chrom, {}
5793                            ).get("count", 0)
5794
5795                            log.debug(
5796                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
5797                            )
5798
5799                            # Annotation with regions database
5800                            if parquet_type in ["regions"]:
5801                                sql_query_annotation_from_clause = f"""
5802                                    FROM (
5803                                        SELECT 
5804                                            '{chrom}' AS \"#CHROM\",
5805                                            table_variants_from.\"POS\" AS \"POS\",
5806                                            {",".join(sql_query_annotation_to_agregate)}
5807                                        FROM {table_variants} as table_variants_from
5808                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
5809                                            table_parquet_from."#CHROM" = '{chrom}'
5810                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
5811                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
5812                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
5813                                                )
5814                                        )
5815                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
5816                                        GROUP BY table_variants_from.\"POS\"
5817                                        )
5818                                        as table_parquet
5819                                """
5820
5821                                sql_query_annotation_where_clause = """
5822                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
5823                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
5824                                """
5825
5826                            # Annotation with variants database
5827                            else:
5828                                sql_query_annotation_from_clause = f"""
5829                                    FROM {parquet_file_link} as table_parquet
5830                                """
5831                                sql_query_annotation_where_clause = f"""
5832                                    table_variants."#CHROM" = '{chrom}'
5833                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
5834                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
5835                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
5836                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
5837                                """
5838
5839                            # Create update query
5840                            sql_query_annotation_chrom_interval_pos = f"""
5841                                UPDATE {table_variants} as table_variants
5842                                    SET INFO = 
5843                                        concat(
5844                                            CASE WHEN table_variants.INFO NOT IN ('','.')
5845                                                THEN table_variants.INFO
5846                                                ELSE ''
5847                                            END
5848                                            ,
5849                                            CASE WHEN table_variants.INFO NOT IN ('','.')
5850                                                        AND (
5851                                                        concat({sql_query_annotation_update_info_sets_sql})
5852                                                        )
5853                                                        NOT IN ('','.') 
5854                                                    THEN ';'
5855                                                    ELSE ''
5856                                            END
5857                                            ,
5858                                            {sql_query_annotation_update_info_sets_sql}
5859                                            )
5860                                    {sql_query_annotation_from_clause}
5861                                    WHERE {sql_query_annotation_where_clause}
5862                                    ;
5863                                """
5864
5865                            # Add update query to dict
5866                            query_dict[
5867                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
5868                            ] = sql_query_annotation_chrom_interval_pos
5869
5870                        nb_of_query = len(query_dict)
5871                        num_query = 0
5872
5873                        # SET max_expression_depth TO x
5874                        self.conn.execute("SET max_expression_depth TO 10000")
5875
5876                        for query_name in query_dict:
5877                            query = query_dict[query_name]
5878                            num_query += 1
5879                            log.info(
5880                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
5881                            )
5882                            result = self.conn.execute(query)
5883                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
5884                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
5885                            log.info(
5886                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
5887                            )
5888
5889                        log.info(
5890                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
5891                        )
5892
5893                    else:
5894
5895                        log.info(
5896                            f"Annotation '{annotation_name}' - No Annotations available"
5897                        )
5898
5899                    log.debug("Final header: " + str(vcf_reader.infos))
5900
5901        # Remove added columns
5902        for added_column in added_columns:
5903            self.drop_column(column=added_column)
5904
5905    def annotation_splice(self, threads: int = None) -> None:
5906        """
5907        This function annotate with snpEff
5908
5909        :param threads: The number of threads to use
5910        :return: the value of the variable "return_value".
5911        """
5912
5913        # DEBUG
5914        log.debug("Start annotation with splice tools")
5915
5916        # Threads
5917        if not threads:
5918            threads = self.get_threads()
5919        log.debug("Threads: " + str(threads))
5920
5921        # DEBUG
5922        delete_tmp = True
5923        if self.get_config().get("verbosity", "warning") in ["debug"]:
5924            delete_tmp = False
5925            log.debug("Delete tmp files/folders: " + str(delete_tmp))
5926
5927        # Config
5928        config = self.get_config()
5929        log.debug("Config: " + str(config))
5930        splice_config = config.get("tools", {}).get("splice", {})
5931        if not splice_config:
5932            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
5933        if not splice_config:
5934            msg_err = "No Splice tool config"
5935            log.error(msg_err)
5936            raise ValueError(msg_err)
5937        log.debug(f"splice_config={splice_config}")
5938
5939        # Config - Folders - Databases
5940        databases_folders = (
5941            config.get("folders", {}).get("databases", {}).get("splice", ["."])
5942        )
5943        log.debug("Databases annotations: " + str(databases_folders))
5944
5945        # Splice docker image
5946        splice_docker_image = splice_config.get("docker").get("image")
5947
5948        # Pull splice image if it's not already there
5949        if not check_docker_image_exists(splice_docker_image):
5950            log.warning(
5951                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
5952            )
5953            try:
5954                command(f"docker pull {splice_config.get('docker').get('image')}")
5955            except subprocess.CalledProcessError:
5956                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
5957                log.error(msg_err)
5958                raise ValueError(msg_err)
5959                return None
5960
5961        # Config - splice databases
5962        splice_databases = (
5963            config.get("folders", {})
5964            .get("databases", {})
5965            .get("splice", DEFAULT_SPLICE_FOLDER)
5966        )
5967        splice_databases = full_path(splice_databases)
5968
5969        # Param
5970        param = self.get_param()
5971        log.debug("Param: " + str(param))
5972
5973        # Param
5974        options = param.get("annotation", {}).get("splice", {})
5975        log.debug("Options: " + str(options))
5976
5977        # Data
5978        table_variants = self.get_table_variants()
5979
5980        # Check if not empty
5981        log.debug("Check if not empty")
5982        sql_query_chromosomes = (
5983            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
5984        )
5985        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
5986            log.info("VCF empty")
5987            return None
5988
5989        # Export in VCF
5990        log.debug("Create initial file to annotate")
5991
5992        # Create output folder
5993        output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
5994        if not os.path.exists(output_folder):
5995            Path(output_folder).mkdir(parents=True, exist_ok=True)
5996
5997        # Create tmp VCF file
5998        tmp_vcf = NamedTemporaryFile(
5999            prefix=self.get_prefix(),
6000            dir=output_folder,
6001            suffix=".vcf",
6002            delete=False,
6003        )
6004        tmp_vcf_name = tmp_vcf.name
6005
6006        # VCF header
6007        header = self.get_header()
6008
6009        # Existing annotations
6010        for vcf_annotation in self.get_header().infos:
6011
6012            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
6013            log.debug(
6014                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
6015            )
6016
6017        # Memory limit
6018        if config.get("memory", None):
6019            memory_limit = config.get("memory", "8G").upper()
6020            # upper()
6021        else:
6022            memory_limit = "8G"
6023        log.debug(f"memory_limit: {memory_limit}")
6024
6025        # Check number of variants to annotate
6026        where_clause_regex_spliceai = r"SpliceAI_\w+"
6027        where_clause_regex_spip = r"SPiP_\w+"
6028        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
6029        df_list_of_variants_to_annotate = self.get_query_to_df(
6030            query=f""" SELECT * FROM variants {where_clause} """
6031        )
6032        if len(df_list_of_variants_to_annotate) == 0:
6033            log.warning(
6034                f"No variants to annotate with splice. Variants probably already annotated with splice"
6035            )
6036            return None
6037        else:
6038            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
6039
6040        # Export VCF file
6041        self.export_variant_vcf(
6042            vcf_file=tmp_vcf_name,
6043            remove_info=True,
6044            add_samples=True,
6045            index=False,
6046            where_clause=where_clause,
6047        )
6048
6049        # Create docker container and launch splice analysis
6050        if splice_config:
6051
6052            # Splice mount folders
6053            mount_folders = splice_config.get("mount", {})
6054
6055            # Genome mount
6056            mount_folders[
6057                config.get("folders", {})
6058                .get("databases", {})
6059                .get("genomes", DEFAULT_GENOME_FOLDER)
6060            ] = "ro"
6061
6062            # SpliceAI mount
6063            mount_folders[
6064                config.get("folders", {})
6065                .get("databases", {})
6066                .get("spliceai", DEFAULT_SPLICEAI_FOLDER)
6067            ] = "ro"
6068
6069            # Genome mount
6070            mount_folders[
6071                config.get("folders", {})
6072                .get("databases", {})
6073                .get("spip", DEFAULT_SPIP_FOLDER)
6074            ] = "ro"
6075
6076            # Mount folders
6077            mount = []
6078
6079            # Config mount
6080            mount = [
6081                f"-v {full_path(path)}:{full_path(path)}:{mode}"
6082                for path, mode in mount_folders.items()
6083            ]
6084
6085            if any(value for value in splice_config.values() if value is None):
6086                log.warning("At least one splice config parameter is empty")
6087                return None
6088
6089            # Params in splice nf
6090            def check_values(dico: dict):
6091                """
6092                Ensure parameters for NF splice pipeline
6093                """
6094                for key, val in dico.items():
6095                    if key == "genome":
6096                        if any(
6097                            assemb in options.get("genome", {})
6098                            for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
6099                        ):
6100                            yield f"--{key} hg19"
6101                        elif any(
6102                            assemb in options.get("genome", {})
6103                            for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
6104                        ):
6105                            yield f"--{key} hg38"
6106                    elif (
6107                        (isinstance(val, str) and val)
6108                        or isinstance(val, int)
6109                        or isinstance(val, bool)
6110                    ):
6111                        yield f"--{key} {val}"
6112
6113            # Genome
6114            genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
6115            options["genome"] = genome
6116
6117            # NF params
6118            nf_params = []
6119
6120            # Add options
6121            if options:
6122                nf_params = list(check_values(options))
6123                log.debug(f"Splice NF params: {' '.join(nf_params)}")
6124            else:
6125                log.debug("No NF params provided")
6126
6127            # Add threads
6128            if "threads" not in options.keys():
6129                nf_params.append(f"--threads {threads}")
6130
6131            # Genome path
6132            genome_path = find_genome(
6133                config.get("folders", {})
6134                .get("databases", {})
6135                .get("genomes", DEFAULT_GENOME_FOLDER),
6136                file=f"{genome}.fa",
6137            )
6138            # Add genome path
6139            if not genome_path:
6140                raise ValueError(
6141                    f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
6142                )
6143            else:
6144                log.debug(f"Genome: {genome_path}")
6145                nf_params.append(f"--genome_path {genome_path}")
6146
6147            def splice_annotations(options: dict = {}, config: dict = {}) -> list:
6148                """
6149                Setting up updated databases for SPiP and SpliceAI
6150                """
6151
6152                try:
6153
6154                    # SpliceAI assembly transcriptome
6155                    spliceai_assembly = os.path.join(
6156                        config.get("folders", {})
6157                        .get("databases", {})
6158                        .get("spliceai", {}),
6159                        options.get("genome"),
6160                        "transcriptome",
6161                    )
6162                    spip_assembly = options.get("genome")
6163
6164                    spip = find(
6165                        f"transcriptome_{spip_assembly}.RData",
6166                        config.get("folders", {}).get("databases", {}).get("spip", {}),
6167                    )
6168                    spliceai = find("spliceai.refseq.txt", spliceai_assembly)
6169                    log.debug(f"SPiP annotations: {spip}")
6170                    log.debug(f"SpliceAI annotations: {spliceai}")
6171                    if spip and spliceai:
6172                        return [
6173                            f"--spip_transcriptome {spip}",
6174                            f"--spliceai_annotations {spliceai}",
6175                        ]
6176                    else:
6177                        # TODO crash and go on with basic annotations ?
6178                        # raise ValueError(
6179                        #     "Can't find splice databases in configuration EXIT"
6180                        # )
6181                        log.warning(
6182                            "Can't find splice databases in configuration, use annotations file from image"
6183                        )
6184                except TypeError:
6185                    log.warning(
6186                        "Can't find splice databases in configuration, use annotations file from image"
6187                    )
6188                    return []
6189
6190            # Add options, check if transcriptome option have already beend provided
6191            if (
6192                "spip_transcriptome" not in nf_params
6193                and "spliceai_transcriptome" not in nf_params
6194            ):
6195                splice_reference = splice_annotations(options, config)
6196                if splice_reference:
6197                    nf_params.extend(splice_reference)
6198
6199            nf_params.append(f"--output_folder {output_folder}")
6200
6201            random_uuid = f"HOWARD-SPLICE-{get_random()}"
6202            cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
6203            log.debug(cmd)
6204
6205            splice_config["docker"]["command"] = cmd
6206
6207            docker_cmd = get_bin_command(
6208                tool="splice",
6209                bin_type="docker",
6210                config=config,
6211                default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
6212                add_options=f"--name {random_uuid} {' '.join(mount)}",
6213            )
6214
6215            # Docker debug
6216            # if splice_config.get("rm_container"):
6217            #     rm_container = "--rm"
6218            # else:
6219            #     rm_container = ""
6220            # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
6221
6222            log.debug(docker_cmd)
6223            res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
6224            log.debug(res.stdout)
6225            if res.stderr:
6226                log.error(res.stderr)
6227            res.check_returncode()
6228        else:
6229            log.warning(f"Splice tool configuration not found: {config}")
6230
6231        # Update variants
6232        log.info("Annotation - Updating...")
6233        # Test find output vcf
6234        log.debug(
6235            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6236        )
6237        output_vcf = []
6238        # Wrong folder to look in
6239        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
6240            if (
6241                files
6242                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6243            ):
6244                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
6245        # log.debug(os.listdir(options.get("output_folder")))
6246        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
6247        if not output_vcf:
6248            log.debug(
6249                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
6250            )
6251        else:
6252            # Get new header from annotated vcf
6253            log.debug(f"Initial header: {len(header.infos)} fields")
6254            # Create new header with splice infos
6255            new_vcf = Variants(input=output_vcf[0])
6256            new_vcf_header = new_vcf.get_header().infos
6257            for keys, infos in new_vcf_header.items():
6258                if keys not in header.infos.keys():
6259                    header.infos[keys] = infos
6260            log.debug(f"New header: {len(header.infos)} fields")
6261            log.debug(f"Splice tmp output: {output_vcf[0]}")
6262            self.update_from_vcf(output_vcf[0])
6263
6264        # Remove folder
6265        remove_if_exists(output_folder)
6266
6267    ###
6268    # Prioritization
6269    ###
6270
6271    def get_config_default(self, name: str) -> dict:
6272        """
6273        The function `get_config_default` returns a dictionary containing default configurations for
6274        various calculations and prioritizations.
6275
6276        :param name: The `get_config_default` function returns a dictionary containing default
6277        configurations for different calculations and prioritizations. The `name` parameter is used to
6278        specify which specific configuration to retrieve from the dictionary
6279        :type name: str
6280        :return: The function `get_config_default` returns a dictionary containing default configuration
6281        settings for different calculations and prioritizations. The specific configuration settings are
6282        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
6283        matches a key in the `config_default` dictionary, the corresponding configuration settings are
6284        returned. If there is no match, an empty dictionary is returned.
6285        """
6286
6287        config_default = {
6288            "calculations": {
6289                "variant_chr_pos_alt_ref": {
6290                    "type": "sql",
6291                    "name": "variant_chr_pos_alt_ref",
6292                    "description": "Create a variant ID with chromosome, position, alt and ref",
6293                    "available": False,
6294                    "output_column_name": "variant_chr_pos_alt_ref",
6295                    "output_column_type": "String",
6296                    "output_column_description": "variant ID with chromosome, position, alt and ref",
6297                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
6298                    "operation_info": True,
6299                },
6300                "VARTYPE": {
6301                    "type": "sql",
6302                    "name": "VARTYPE",
6303                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
6304                    "available": True,
6305                    "output_column_name": "VARTYPE",
6306                    "output_column_type": "String",
6307                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
6308                    "operation_query": """
6309                            CASE
6310                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
6311                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
6312                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
6313                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
6314                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
6315                                ELSE 'UNDEFINED'
6316                            END
6317                            """,
6318                    "info_fields": ["SVTYPE"],
6319                    "operation_info": True,
6320                },
6321                "snpeff_hgvs": {
6322                    "type": "python",
6323                    "name": "snpeff_hgvs",
6324                    "description": "HGVS nomenclatures from snpEff annotation",
6325                    "available": True,
6326                    "function_name": "calculation_extract_snpeff_hgvs",
6327                    "function_params": ["snpeff_hgvs", "ANN"],
6328                },
6329                "snpeff_ann_explode": {
6330                    "type": "python",
6331                    "name": "snpeff_ann_explode",
6332                    "description": "Explode snpEff annotations with uniquify values",
6333                    "available": True,
6334                    "function_name": "calculation_snpeff_ann_explode",
6335                    "function_params": [False, "fields", "snpeff_", "ANN"],
6336                },
6337                "snpeff_ann_explode_uniquify": {
6338                    "type": "python",
6339                    "name": "snpeff_ann_explode_uniquify",
6340                    "description": "Explode snpEff annotations",
6341                    "available": True,
6342                    "function_name": "calculation_snpeff_ann_explode",
6343                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
6344                },
6345                "snpeff_ann_explode_json": {
6346                    "type": "python",
6347                    "name": "snpeff_ann_explode_json",
6348                    "description": "Explode snpEff annotations in JSON format",
6349                    "available": True,
6350                    "function_name": "calculation_snpeff_ann_explode",
6351                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
6352                },
6353                "NOMEN": {
6354                    "type": "python",
6355                    "name": "NOMEN",
6356                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field",
6357                    "available": True,
6358                    "function_name": "calculation_extract_nomen",
6359                    "function_params": [],
6360                },
6361                "FINDBYPIPELINE": {
6362                    "type": "python",
6363                    "name": "FINDBYPIPELINE",
6364                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
6365                    "available": True,
6366                    "function_name": "calculation_find_by_pipeline",
6367                    "function_params": ["findbypipeline"],
6368                },
6369                "FINDBYSAMPLE": {
6370                    "type": "python",
6371                    "name": "FINDBYSAMPLE",
6372                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
6373                    "available": True,
6374                    "function_name": "calculation_find_by_pipeline",
6375                    "function_params": ["findbysample"],
6376                },
6377                "GENOTYPECONCORDANCE": {
6378                    "type": "python",
6379                    "name": "GENOTYPECONCORDANCE",
6380                    "description": "Concordance of genotype for multi caller VCF",
6381                    "available": True,
6382                    "function_name": "calculation_genotype_concordance",
6383                    "function_params": [],
6384                },
6385                "BARCODE": {
6386                    "type": "python",
6387                    "name": "BARCODE",
6388                    "description": "BARCODE as VaRank tool",
6389                    "available": True,
6390                    "function_name": "calculation_barcode",
6391                    "function_params": [],
6392                },
6393                "BARCODEFAMILY": {
6394                    "type": "python",
6395                    "name": "BARCODEFAMILY",
6396                    "description": "BARCODEFAMILY as VaRank tool",
6397                    "available": True,
6398                    "function_name": "calculation_barcode_family",
6399                    "function_params": ["BCF"],
6400                },
6401                "TRIO": {
6402                    "type": "python",
6403                    "name": "TRIO",
6404                    "description": "Inheritance for a trio family",
6405                    "available": True,
6406                    "function_name": "calculation_trio",
6407                    "function_params": [],
6408                },
6409                "VAF": {
6410                    "type": "python",
6411                    "name": "VAF",
6412                    "description": "Variant Allele Frequency (VAF) harmonization",
6413                    "available": True,
6414                    "function_name": "calculation_vaf_normalization",
6415                    "function_params": [],
6416                },
6417                "VAF_stats": {
6418                    "type": "python",
6419                    "name": "VAF_stats",
6420                    "description": "Variant Allele Frequency (VAF) statistics",
6421                    "available": True,
6422                    "function_name": "calculation_genotype_stats",
6423                    "function_params": ["VAF"],
6424                },
6425                "DP_stats": {
6426                    "type": "python",
6427                    "name": "DP_stats",
6428                    "description": "Depth (DP) statistics",
6429                    "available": True,
6430                    "function_name": "calculation_genotype_stats",
6431                    "function_params": ["DP"],
6432                },
6433                "variant_id": {
6434                    "type": "python",
6435                    "name": "variant_id",
6436                    "description": "Variant ID generated from variant position and type",
6437                    "available": True,
6438                    "function_name": "calculation_variant_id",
6439                    "function_params": [],
6440                },
6441            },
6442            "prioritizations": {
6443                "default": {
6444                    "filter": [
6445                        {
6446                            "type": "notequals",
6447                            "value": "!PASS|\\.",
6448                            "score": 0,
6449                            "flag": "FILTERED",
6450                            "comment": ["Bad variant quality"],
6451                        },
6452                        {
6453                            "type": "equals",
6454                            "value": "REJECT",
6455                            "score": -20,
6456                            "flag": "PASS",
6457                            "comment": ["Bad variant quality"],
6458                        },
6459                    ],
6460                    "DP": [
6461                        {
6462                            "type": "gte",
6463                            "value": "50",
6464                            "score": 5,
6465                            "flag": "PASS",
6466                            "comment": ["DP higher than 50"],
6467                        }
6468                    ],
6469                    "ANN": [
6470                        {
6471                            "type": "contains",
6472                            "value": "HIGH",
6473                            "score": 5,
6474                            "flag": "PASS",
6475                            "comment": [
6476                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
6477                            ],
6478                        },
6479                        {
6480                            "type": "contains",
6481                            "value": "MODERATE",
6482                            "score": 3,
6483                            "flag": "PASS",
6484                            "comment": [
6485                                "A non-disruptive variant that might change protein effectiveness"
6486                            ],
6487                        },
6488                        {
6489                            "type": "contains",
6490                            "value": "LOW",
6491                            "score": 0,
6492                            "flag": "FILTERED",
6493                            "comment": [
6494                                "Assumed to be mostly harmless or unlikely to change protein behavior"
6495                            ],
6496                        },
6497                        {
6498                            "type": "contains",
6499                            "value": "MODIFIER",
6500                            "score": 0,
6501                            "flag": "FILTERED",
6502                            "comment": [
6503                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
6504                            ],
6505                        },
6506                    ],
6507                }
6508            },
6509        }
6510
6511        return config_default.get(name, None)
6512
6513    def get_config_json(
6514        self, name: str, config_dict: dict = {}, config_file: str = None
6515    ) -> dict:
6516        """
6517        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
6518        default values, a dictionary, and a file.
6519
6520        :param name: The `name` parameter in the `get_config_json` function is a string that represents
6521        the name of the configuration. It is used to identify and retrieve the configuration settings
6522        for a specific component or module
6523        :type name: str
6524        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
6525        dictionary that allows you to provide additional configuration settings or overrides. When you
6526        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
6527        the key is the configuration setting you want to override or
6528        :type config_dict: dict
6529        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
6530        specify the path to a configuration file that contains additional settings. If provided, the
6531        function will read the contents of this file and update the configuration dictionary with the
6532        values found in the file, overriding any existing values with the
6533        :type config_file: str
6534        :return: The function `get_config_json` returns a dictionary containing the configuration
6535        settings.
6536        """
6537
6538        # Create with default prioritizations
6539        config_default = self.get_config_default(name=name)
6540        configuration = config_default
6541        # log.debug(f"configuration={configuration}")
6542
6543        # Replace prioritizations from dict
6544        for config in config_dict:
6545            configuration[config] = config_dict[config]
6546
6547        # Replace prioritizations from file
6548        config_file = full_path(config_file)
6549        if config_file:
6550            if os.path.exists(config_file):
6551                with open(config_file) as config_file_content:
6552                    config_file_dict = json.load(config_file_content)
6553                for config in config_file_dict:
6554                    configuration[config] = config_file_dict[config]
6555            else:
6556                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
6557                log.error(msg_error)
6558                raise ValueError(msg_error)
6559
6560        return configuration
6561
6562    def prioritization(self) -> None:
6563        """
6564        It takes a VCF file, and adds a bunch of new INFO fields to it, based on the values of other
6565        INFO fields
6566        """
6567
6568        # Config
6569        config = self.get_config()
6570
6571        # Param
6572        param = self.get_param()
6573
6574        # Quick Prioritizations
6575        # prioritizations = param.get("prioritization", {}).get("prioritizations", "")
6576
6577        # Configuration profiles
6578        prioritization_config_file = param.get("prioritization", {}).get(
6579            "prioritization_config", None
6580        )
6581        prioritization_config_file = full_path(prioritization_config_file)
6582        prioritizations_config = self.get_config_json(
6583            name="prioritizations", config_file=prioritization_config_file
6584        )
6585
6586        # Prioritization options
6587        profiles = param.get("prioritization", {}).get("profiles", [])
6588        if isinstance(profiles, str):
6589            profiles = profiles.split(",")
6590        pzfields = param.get("prioritization", {}).get(
6591            "pzfields", ["PZFlag", "PZScore"]
6592        )
6593        if isinstance(pzfields, str):
6594            pzfields = pzfields.split(",")
6595        default_profile = param.get("prioritization", {}).get("default_profile", None)
6596        pzfields_sep = param.get("prioritization", {}).get("pzfields_sep", "_")
6597        prioritization_score_mode = param.get("prioritization", {}).get(
6598            "prioritization_score_mode", "HOWARD"
6599        )
6600
6601        # Quick Prioritizations
6602        # prioritizations = param.get("prioritization", {}).get("prioritizations", None)
6603        prioritizations = param.get("prioritizations", None)
6604        if prioritizations:
6605            log.info("Quick Prioritization:")
6606            for profile in prioritizations.split(","):
6607                if profile not in profiles:
6608                    profiles.append(profile)
6609                    log.info(f"   {profile}")
6610
6611        # If profile "ALL" provided, all profiles in the config profiles
6612        if "ALL" in profiles:
6613            profiles = list(prioritizations_config.keys())
6614
6615        for profile in profiles:
6616            if prioritizations_config.get(profile, None):
6617                log.debug(f"Profile '{profile}' configured")
6618            else:
6619                msg_error = f"Profile '{profile}' NOT configured"
6620                log.error(msg_error)
6621                raise ValueError(msg_error)
6622
6623        if profiles:
6624            log.info(f"Prioritization... ")
6625        else:
6626            log.debug(f"No profile defined")
6627            return
6628
6629        if not default_profile and len(profiles):
6630            default_profile = profiles[0]
6631
6632        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
6633        log.debug("Profiles to check: " + str(list(profiles)))
6634
6635        # Variables
6636        table_variants = self.get_table_variants(clause="update")
6637
6638        # Added columns
6639        added_columns = []
6640
6641        # Create list of PZfields
6642        # List of PZFields
6643        list_of_pzfields_original = pzfields + [
6644            pzfield + pzfields_sep + profile
6645            for pzfield in pzfields
6646            for profile in profiles
6647        ]
6648        list_of_pzfields = []
6649        log.debug(f"{list_of_pzfields_original}")
6650
6651        # Remove existing PZfields to use if exists
6652        for pzfield in list_of_pzfields_original:
6653            if self.get_header().infos.get(pzfield, None) is None:
6654                list_of_pzfields.append(pzfield)
6655                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
6656            else:
6657                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
6658
6659        if list_of_pzfields:
6660
6661            # Explode Infos fields
6662            explode_infos_prefix = self.get_explode_infos_prefix()
6663            added_columns += self.explode_infos(prefix=explode_infos_prefix)
6664            extra_infos = self.get_extra_infos()
6665
6666            # PZfields tags description
6667            PZfields_INFOS = {
6668                "PZTags": {
6669                    "ID": "PZTags",
6670                    "Number": ".",
6671                    "Type": "String",
6672                    "Description": "Variant tags based on annotation criteria",
6673                },
6674                "PZScore": {
6675                    "ID": "PZScore",
6676                    "Number": 1,
6677                    "Type": "Integer",
6678                    "Description": "Variant score based on annotation criteria",
6679                },
6680                "PZFlag": {
6681                    "ID": "PZFlag",
6682                    "Number": 1,
6683                    "Type": "String",
6684                    "Description": "Variant flag based on annotation criteria",
6685                },
6686                "PZComment": {
6687                    "ID": "PZComment",
6688                    "Number": ".",
6689                    "Type": "String",
6690                    "Description": "Variant comment based on annotation criteria",
6691                },
6692                "PZInfos": {
6693                    "ID": "PZInfos",
6694                    "Number": ".",
6695                    "Type": "String",
6696                    "Description": "Variant infos based on annotation criteria",
6697                },
6698            }
6699
6700            # Create INFO fields if not exist
6701            for field in PZfields_INFOS:
6702                field_ID = PZfields_INFOS[field]["ID"]
6703                field_description = PZfields_INFOS[field]["Description"]
6704                if field_ID not in self.get_header().infos and field_ID in pzfields:
6705                    field_description = (
6706                        PZfields_INFOS[field]["Description"]
6707                        + f", profile {default_profile}"
6708                    )
6709                    self.get_header().infos[field_ID] = vcf.parser._Info(
6710                        field_ID,
6711                        PZfields_INFOS[field]["Number"],
6712                        PZfields_INFOS[field]["Type"],
6713                        field_description,
6714                        "unknown",
6715                        "unknown",
6716                        code_type_map[PZfields_INFOS[field]["Type"]],
6717                    )
6718
6719            # Create INFO fields if not exist for each profile
6720            for profile in prioritizations_config:
6721                if profile in profiles or profiles == []:
6722                    for field in PZfields_INFOS:
6723                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
6724                        field_description = (
6725                            PZfields_INFOS[field]["Description"]
6726                            + f", profile {profile}"
6727                        )
6728                        if (
6729                            field_ID not in self.get_header().infos
6730                            and field in pzfields
6731                        ):
6732                            self.get_header().infos[field_ID] = vcf.parser._Info(
6733                                field_ID,
6734                                PZfields_INFOS[field]["Number"],
6735                                PZfields_INFOS[field]["Type"],
6736                                field_description,
6737                                "unknown",
6738                                "unknown",
6739                                code_type_map[PZfields_INFOS[field]["Type"]],
6740                            )
6741
6742            # Header
6743            for pzfield in list_of_pzfields:
6744                if re.match("PZScore.*", pzfield):
6745                    added_column = self.add_column(
6746                        table_name=table_variants,
6747                        column_name=pzfield,
6748                        column_type="INTEGER",
6749                        default_value="0",
6750                    )
6751                elif re.match("PZFlag.*", pzfield):
6752                    added_column = self.add_column(
6753                        table_name=table_variants,
6754                        column_name=pzfield,
6755                        column_type="BOOLEAN",
6756                        default_value="1",
6757                    )
6758                else:
6759                    added_column = self.add_column(
6760                        table_name=table_variants,
6761                        column_name=pzfield,
6762                        column_type="STRING",
6763                        default_value="''",
6764                    )
6765                added_columns.append(added_column)
6766
6767            # Profiles
6768            if profiles:
6769
6770                # foreach profile in configuration file
6771                for profile in prioritizations_config:
6772
6773                    # If profile is asked in param, or ALL are asked (empty profile [])
6774                    if profile in profiles or profiles == []:
6775                        log.info(f"Profile '{profile}'")
6776
6777                        sql_set_info_option = ""
6778
6779                        sql_set_info = []
6780
6781                        # PZ fields set
6782
6783                        # PZScore
6784                        if f"PZScore{pzfields_sep}{profile}" in list_of_pzfields:
6785                            sql_set_info.append(
6786                                f"""
6787                                    concat(
6788                                        'PZScore{pzfields_sep}{profile}=',
6789                                        PZScore{pzfields_sep}{profile}
6790                                    ) 
6791                                """
6792                            )
6793                            if (
6794                                profile == default_profile
6795                                and "PZScore" in list_of_pzfields
6796                            ):
6797                                sql_set_info.append(
6798                                    f"""
6799                                        concat(
6800                                            'PZScore=',
6801                                            PZScore{pzfields_sep}{profile}
6802                                        )
6803                                    """
6804                                )
6805
6806                        # PZFlag
6807                        if f"PZFlag{pzfields_sep}{profile}" in list_of_pzfields:
6808                            sql_set_info.append(
6809                                f"""
6810                                    concat(
6811                                        'PZFlag{pzfields_sep}{profile}=',
6812                                        CASE 
6813                                            WHEN PZFlag{pzfields_sep}{profile}==1
6814                                            THEN 'PASS'
6815                                            WHEN PZFlag{pzfields_sep}{profile}==0
6816                                            THEN 'FILTERED'
6817                                        END
6818                                    ) 
6819                                """
6820                            )
6821                            if (
6822                                profile == default_profile
6823                                and "PZFlag" in list_of_pzfields
6824                            ):
6825                                sql_set_info.append(
6826                                    f"""
6827                                        concat(
6828                                            'PZFlag=',
6829                                            CASE 
6830                                                WHEN PZFlag{pzfields_sep}{profile}==1
6831                                                THEN 'PASS'
6832                                                WHEN PZFlag{pzfields_sep}{profile}==0
6833                                                THEN 'FILTERED'
6834                                            END
6835                                        )
6836                                    """
6837                                )
6838
6839                        # PZComment
6840                        if f"PZComment{pzfields_sep}{profile}" in list_of_pzfields:
6841                            sql_set_info.append(
6842                                f"""
6843                                    CASE
6844                                        WHEN PZComment{pzfields_sep}{profile} NOT IN ('')
6845                                        THEN concat('PZComment{pzfields_sep}{profile}=', PZComment{pzfields_sep}{profile})
6846                                        ELSE ''
6847                                    END
6848                                """
6849                            )
6850                            if (
6851                                profile == default_profile
6852                                and "PZComment" in list_of_pzfields
6853                            ):
6854                                sql_set_info.append(
6855                                    f"""
6856                                        CASE
6857                                            WHEN PZComment{pzfields_sep}{profile} NOT IN ('')
6858                                            THEN concat('PZComment=', PZComment{pzfields_sep}{profile})
6859                                            ELSE ''
6860                                        END
6861                                    """
6862                                )
6863
6864                        # PZInfos
6865                        if f"PZInfos{pzfields_sep}{profile}" in list_of_pzfields:
6866                            sql_set_info.append(
6867                                f"""
6868                                    CASE
6869                                        WHEN PZInfos{pzfields_sep}{profile} NOT IN ('')
6870                                        THEN concat('PZInfos{pzfields_sep}{profile}=', PZInfos{pzfields_sep}{profile})
6871                                        ELSE ''
6872                                    END
6873                                """
6874                            )
6875                            if (
6876                                profile == default_profile
6877                                and "PZInfos" in list_of_pzfields
6878                            ):
6879                                sql_set_info.append(
6880                                    f"""
6881                                        CASE
6882                                            WHEN PZInfos{pzfields_sep}{profile} NOT IN ('')
6883                                            THEN concat('PZInfos=', PZInfos{pzfields_sep}{profile})
6884                                            ELSE ''
6885                                        END
6886                                    """
6887                                )
6888
6889                        # Merge PZfields
6890                        sql_set_info_option = ""
6891                        sql_set_sep = ""
6892                        for sql_set in sql_set_info:
6893                            if sql_set_sep:
6894                                sql_set_info_option += f"""
6895                                    , concat('{sql_set_sep}', {sql_set})
6896                                """
6897                            else:
6898                                sql_set_info_option += f"""
6899                                    , {sql_set}
6900                                """
6901                            sql_set_sep = ";"
6902
6903                        sql_queries = []
6904                        for annotation in prioritizations_config[profile]:
6905
6906                            # Check if annotation field is present
6907                            if not f"{explode_infos_prefix}{annotation}" in extra_infos:
6908                                log.debug(f"Annotation '{annotation}' not in data")
6909                                continue
6910                            else:
6911                                log.debug(f"Annotation '{annotation}' in data")
6912
6913                            # For each criterions
6914                            for criterion in prioritizations_config[profile][
6915                                annotation
6916                            ]:
6917                                criterion_type = criterion["type"]
6918                                criterion_value = criterion["value"]
6919                                criterion_score = criterion.get("score", 0)
6920                                criterion_flag = criterion.get("flag", "PASS")
6921                                criterion_flag_bool = criterion_flag == "PASS"
6922                                criterion_comment = (
6923                                    ", ".join(criterion.get("comment", []))
6924                                    .replace("'", "''")
6925                                    .replace(";", ",")
6926                                    .replace("\t", " ")
6927                                )
6928                                criterion_infos = (
6929                                    str(criterion)
6930                                    .replace("'", "''")
6931                                    .replace(";", ",")
6932                                    .replace("\t", " ")
6933                                )
6934
6935                                sql_set = []
6936                                sql_set_info = []
6937
6938                                # PZ fields set
6939                                if (
6940                                    f"PZScore{pzfields_sep}{profile}"
6941                                    in list_of_pzfields
6942                                ):
6943                                    if prioritization_score_mode == "HOWARD":
6944                                        sql_set.append(
6945                                            f"PZScore{pzfields_sep}{profile} = PZScore{pzfields_sep}{profile} + {criterion_score}"
6946                                        )
6947                                    elif prioritization_score_mode == "VaRank":
6948                                        sql_set.append(
6949                                            f"PZScore{pzfields_sep}{profile} = CASE WHEN {criterion_score}>PZScore{pzfields_sep}{profile} THEN {criterion_score} END"
6950                                        )
6951                                    else:
6952                                        sql_set.append(
6953                                            f"PZScore{pzfields_sep}{profile} = PZScore{pzfields_sep}{profile} + {criterion_score}"
6954                                        )
6955                                if f"PZFlag{pzfields_sep}{profile}" in list_of_pzfields:
6956                                    sql_set.append(
6957                                        f"PZFlag{pzfields_sep}{profile} = PZFlag{pzfields_sep}{profile} AND {criterion_flag_bool}"
6958                                    )
6959                                if (
6960                                    f"PZComment{pzfields_sep}{profile}"
6961                                    in list_of_pzfields
6962                                ):
6963                                    sql_set.append(
6964                                        f"""
6965                                            PZComment{pzfields_sep}{profile} = 
6966                                                concat(
6967                                                    PZComment{pzfields_sep}{profile},
6968                                                    CASE 
6969                                                        WHEN PZComment{pzfields_sep}{profile}!=''
6970                                                        THEN ', '
6971                                                        ELSE ''
6972                                                    END,
6973                                                    '{criterion_comment}'
6974                                                )
6975                                        """
6976                                    )
6977                                if (
6978                                    f"PZInfos{pzfields_sep}{profile}"
6979                                    in list_of_pzfields
6980                                ):
6981                                    sql_set.append(
6982                                        f"""
6983                                            PZInfos{pzfields_sep}{profile} = 
6984                                                concat(
6985                                                    PZInfos{pzfields_sep}{profile},
6986                                                    '{criterion_infos}'
6987                                                )
6988                                        """
6989                                    )
6990                                sql_set_option = ",".join(sql_set)
6991
6992                                # Criterion and comparison
6993                                try:
6994                                    float(criterion_value)
6995                                    sql_update = f"""
6996                                        UPDATE {table_variants}
6997                                        SET {sql_set_option}
6998                                        WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
6999                                        AND "{explode_infos_prefix}{annotation}"{comparison_map[criterion_type]}{criterion_value}
7000                                        """
7001                                except:
7002                                    contains_option = ""
7003                                    if criterion_type == "contains":
7004                                        contains_option = ".*"
7005                                    sql_update = f"""
7006                                        UPDATE {table_variants}
7007                                        SET {sql_set_option}
7008                                        WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
7009                                        """
7010                                sql_queries.append(sql_update)
7011
7012                        # PZTags
7013                        if f"PZTags{pzfields_sep}{profile}" in list_of_pzfields:
7014
7015                            # Create PZFalgs value
7016                            pztags_value = ""
7017                            pztags_sep_default = "|"
7018                            pztags_sep = ""
7019                            for pzfield in pzfields:
7020                                if pzfield not in ["PZTags"]:
7021                                    if (
7022                                        f"{pzfield}{pzfields_sep}{profile}"
7023                                        in list_of_pzfields
7024                                    ):
7025                                        if pzfield in ["PZFlag"]:
7026                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
7027                                                CASE WHEN PZFlag{pzfields_sep}{profile}
7028                                                    THEN 'PASS'
7029                                                    ELSE 'FILTERED'
7030                                                END, '"""
7031                                        else:
7032                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
7033                                        pztags_sep = pztags_sep_default
7034
7035                            # Add Query update for PZFlags
7036                            sql_update_pztags = f"""
7037                                UPDATE {table_variants}
7038                                SET INFO = concat(
7039                                        INFO,
7040                                        CASE WHEN INFO NOT in ('','.')
7041                                                THEN ';'
7042                                                ELSE ''
7043                                        END,
7044                                        'PZTags{pzfields_sep}{profile}={pztags_value}'
7045                                    )
7046                                """
7047                            sql_queries.append(sql_update_pztags)
7048
7049                            # Add Query update for PZFlags for default
7050                            if profile == default_profile:
7051                                sql_update_pztags_default = f"""
7052                                UPDATE {table_variants}
7053                                SET INFO = concat(
7054                                        INFO,
7055                                        ';',
7056                                        'PZTags={pztags_value}'
7057                                    )
7058                                """
7059                                sql_queries.append(sql_update_pztags_default)
7060
7061                        log.info(f"""Profile '{profile}' - Prioritization... """)
7062
7063                        if sql_queries:
7064
7065                            for sql_query in sql_queries:
7066                                log.debug(
7067                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
7068                                )
7069                                self.conn.execute(sql_query)
7070
7071                        log.info(f"""Profile '{profile}' - Update... """)
7072                        sql_query_update = f"""
7073                            UPDATE {table_variants}
7074                            SET INFO =  
7075                                concat(
7076                                    CASE
7077                                        WHEN INFO NOT IN ('','.')
7078                                        THEN concat(INFO, ';')
7079                                        ELSE ''
7080                                    END
7081                                    {sql_set_info_option}
7082                                )
7083                        """
7084                        self.conn.execute(sql_query_update)
7085
7086        else:
7087
7088            log.warning(f"No profiles in parameters")
7089
7090        # Remove added columns
7091        for added_column in added_columns:
7092            self.drop_column(column=added_column)
7093
7094        # Explode INFOS fields into table fields
7095        if self.get_explode_infos():
7096            self.explode_infos(
7097                prefix=self.get_explode_infos_prefix(),
7098                fields=self.get_explode_infos_fields(),
7099                force=True,
7100            )
7101
7102        return
7103
7104    ###
7105    # HGVS
7106    ###
7107
    def annotation_hgvs(self, threads: int = None) -> None:
        """
        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
        coordinates and alleles.

        Overall flow (all steps visible in this method):
        1. Read HGVS options from `param` ("hgvs" section, optionally pre-filled from the
           quick "hgvs_options" comma-separated string); return early if "hgvs" is absent.
        2. Locate the genome FASTA, refSeq and refSeqLink database files.
        3. Select SNV/InDel variants only (REF and ALT strictly alphabetic) into a Pandas
           dataframe, and add a temporary random-named "hgvs_*" column to the variants table.
        4. Load refSeq (and refSeqLink when found) into Polars dataframes and the matching
           transcripts into a python model via `read_transcripts`.
        5. Compute the HGVS string per variant in parallel with a Dask dataframe
           (one partition per thread).
        6. Write results back through a temporary Parquet file, append 'hgvs=...' to the
           INFO column, register the 'hgvs' INFO field in the VCF header, and drop the
           temporary column.

        :param threads: The `threads` parameter is an optional integer that specifies the number of
        threads to use for parallel processing. If no value is provided, it will default to the number
        of threads obtained from the `get_threads()` method
        :type threads: int
        """

        # Function for each partition of the Dask Dataframe
        def partition_function(partition):
            """
            The function `partition_function` applies the `annotation_hgvs_partition` function to
            each row of a DataFrame called `partition`.

            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
            to be processed
            :return: the result of applying the "annotation_hgvs_partition" function to each row of
            the "partition" dataframe along the axis 1.
            """
            return partition.apply(annotation_hgvs_partition, axis=1)

        def annotation_hgvs_partition(row) -> str:
            """
            The function `annotation_hgvs_partition` takes in a row of data and returns a string
            containing a list of HGVS names associated with the given genomic coordinates and alleles.

            :param row: A dictionary-like object providing the "CHROM", "POS", "REF" and
            "ALT" values of one variant
            :return: a comma-separated string of the HGVS names associated with the given
            row of data (one per overlapping transcript, plus the protein-level form when
            `add_protein` is enabled).
            """

            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find list of associated transcripts
            # (transcripts whose genomic span covers this position, from refseq_df)
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                SELECT transcript
                FROM refseq_df
                WHERE CHROM='{chr}'
                AND POS={pos}
            """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein
                # NOTE(review): refseqlink_df is only defined when a refSeqLink file was
                # found; if use_protein/add_protein/full_format is set without one, this
                # query references an unregistered table — confirm upstream guarantees.
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                        SELECT protein
                        FROM refseqlink_df
                        WHERE transcript='{transcript_name}'
                        LIMIT 1
                    """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # Also emit the protein-level HGVS name when requested in addition
                # to the default nucleotide-level one
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Create liste of HGVS annotations
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full

        # Polars connexion
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        # Config
        config = self.get_config()

        # Databases
        # Genome
        databases_genomes_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER)
        )
        # Same config key read again, but with an empty-string default
        # (used below as an explicit genome path for find_genome)
        databases_genome = (
            config.get("folders", {}).get("databases", {}).get("genomes", "")
        )
        # refseq database folder
        databases_refseq_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("refseq", DEFAULT_REFSEQ_FOLDER)
        )
        # refseq
        databases_refseq = config.get("databases", {}).get("refSeq", None)
        # refSeqLink
        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

        # Param
        param = self.get_param()

        # Quick HGVS
        # Parse the "hgvs_options" shortcut ("opt1=val1,opt2,...") into param["hgvs"];
        # an option without an explicit value defaults to True
        if "hgvs_options" in param and param.get("hgvs_options", ""):
            log.info(f"Quick HGVS Annotation:")
            if not param.get("hgvs", None):
                param["hgvs"] = {}
            for option in param.get("hgvs_options", "").split(","):
                option_var_val = option.split("=")
                option_var = option_var_val[0]
                if len(option_var_val) > 1:
                    option_val = option_var_val[1]
                else:
                    option_val = "True"
                if option_val.upper() in ["TRUE"]:
                    option_val = True
                elif option_val.upper() in ["FALSE"]:
                    option_val = False
                log.info(f"   {option_var}={option_val}")
                param["hgvs"][option_var] = option_val

        # Check if HGVS annotation enabled
        if "hgvs" in param:
            log.info(f"HGVS Annotation... ")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            return

        # HGVS Param
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        codon_type = param_hgvs.get("codon_type", "3")

        # refSseq refSeqLink
        # (param-level values override the config-level defaults read above)
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome
        # Prefer the explicitly-configured genome path; otherwise look it up
        # in the genomes folder for the current assembly
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSseq
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only
        # (the regex restricts REF/ALT to pure letters, excluding symbolic alleles)
        query_variants = f"""
            SELECT "#CHROM" AS CHROM, POS, REF, ALT
            FROM {table_variants}
            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
            """
        df_variants = self.get_query_to_df(query_variants)

        # Added columns
        added_columns = []

        # Add hgvs column in variants table
        # (random suffix lowers the chance of clashing with an existing column)
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug(f"refSeq loading...")
        # refSeq in duckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Loading all refSeq in Dataframe
        # (only transcripts whose tx span overlaps a selected variant position)
        refseq_query = f"""
            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
            FROM {refseq_table}
            JOIN df_variants ON (
                {refseq_table}.chrom = df_variants.CHROM
                AND {refseq_table}.txStart<=df_variants.POS
                AND {refseq_table}.txEnd>=df_variants.POS
            )
        """
        refseq_df = self.conn.query(refseq_query).pl()

        if refseqlink_file:
            log.debug(f"refSeqLink loading...")
            # refSeqLink in duckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Loading all refSeqLink in Dataframe
            # (maps transcript accession -> protein accession, versions included)
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
                FROM {refseqlink_table} 
                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
                WHERE protAcc_without_ver IS NOT NULL
            """
            # Polars Dataframe
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read RefSeq transcripts into a python dict/model.
        log.debug(f"Transcripts loading...")
        with tempfile.TemporaryDirectory() as tmpdir:
            transcripts_query = f"""
                COPY (
                    SELECT {refseq_table}.*
                    FROM {refseq_table}
                    JOIN df_variants ON (
                        {refseq_table}.chrom=df_variants.CHROM
                        AND {refseq_table}.txStart<=df_variants.POS
                        AND {refseq_table}.txEnd>=df_variants.POS
                    )
                )
                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
            """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connexion
        # NOTE(review): re-created here, presumably so the SQLContext registers
        # refseq_df / refseqlink_df which did not exist at the first creation above
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create
        # a Dask Dataframe from Pandas dataframe with partition as number of threads
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Use dask.dataframe.apply() to apply function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame to Pandas Dataframe
        df = ddf.compute()

        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column
            # (only rows that actually received a non-empty HGVS annotation)
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
                """
            self.execute_query(update_variant_query)

        # Update INFO column
        # (the ';' separator is appended only when INFO already holds a value)
        sql_query_update = f"""
            UPDATE {table_variants}
            SET INFO = 
                concat(
                    CASE 
                        WHEN INFO NOT IN ('','.')
                        THEN concat(INFO, ';')
                        ELSE ''
                    END,
                    'hgvs=',
                    {hgvs_column_name}
                )
            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
            """
        self.execute_query(sql_query_update)

        # Add header
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        # Register each new INFO field into the VCF header
        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
7490
7491    ###
7492    # Calculation
7493    ###
7494
7495    def get_operations_help(
7496        self, operations_config_dict: dict = {}, operations_config_file: str = None
7497    ) -> list:
7498
7499        # Init
7500        operations_help = []
7501
7502        # operations
7503        operations = self.get_config_json(
7504            name="calculations",
7505            config_dict=operations_config_dict,
7506            config_file=operations_config_file,
7507        )
7508        for op in operations:
7509            op_name = operations[op].get("name", op).upper()
7510            op_description = operations[op].get("description", op_name)
7511            op_available = operations[op].get("available", False)
7512            if op_available:
7513                operations_help.append(f"   {op_name}: {op_description}")
7514
7515        # Sort operations
7516        operations_help.sort()
7517
7518        # insert header
7519        operations_help.insert(0, "Available calculation operations:")
7520
7521        # Return
7522        return operations_help
7523
7524    def calculation(
7525        self,
7526        operations: dict = {},
7527        operations_config_dict: dict = {},
7528        operations_config_file: str = None,
7529    ) -> None:
7530        """
7531        It takes a list of operations, and for each operation, it checks if it's a python or sql
7532        operation, and then calls the appropriate function
7533
7534        param json example:
7535            "calculation": {
7536                "NOMEN": {
7537                    "options": {
7538                        "hgvs_field": "hgvs"
7539                    },
7540                "middle" : null
7541            }
7542        """
7543
7544        # Param
7545        param = self.get_param()
7546
7547        # operations config
7548        operations_config = self.get_config_json(
7549            name="calculations",
7550            config_dict=operations_config_dict,
7551            config_file=operations_config_file,
7552        )
7553
7554        # Upper keys
7555        operations_config = {k.upper(): v for k, v in operations_config.items()}
7556
7557        # Calculations
7558
7559        # Operations from param
7560        operations = param.get("calculation", {}).get("calculations", operations)
7561
7562        # Quick calculation - add
7563        if param.get("calculations", None):
7564            calculations_list = [
7565                value for value in param.get("calculations", "").split(",")
7566            ]
7567            log.info(f"Quick Calculations:")
7568            for calculation_key in calculations_list:
7569                log.info(f"   {calculation_key}")
7570            for calculation_operation in calculations_list:
7571                if calculation_operation.upper() not in operations:
7572                    operations[calculation_operation.upper()] = {}
7573                    add_value_into_dict(
7574                        dict_tree=param,
7575                        sections=[
7576                            "calculation",
7577                            "calculations",
7578                            calculation_operation.upper(),
7579                        ],
7580                        value={},
7581                    )
7582
7583        # Operations for calculation
7584        if not operations:
7585            operations = param.get("calculation", {}).get("calculations", {})
7586
7587        if operations:
7588            log.info(f"Calculations...")
7589
7590        # For each operations
7591        for operation_name in operations:
7592            operation_name = operation_name.upper()
7593            if operation_name not in [""]:
7594                if operation_name in operations_config:
7595                    log.info(f"Calculation '{operation_name}'")
7596                    operation = operations_config[operation_name]
7597                    operation_type = operation.get("type", "sql")
7598                    if operation_type == "python":
7599                        self.calculation_process_function(
7600                            operation=operation, operation_name=operation_name
7601                        )
7602                    elif operation_type == "sql":
7603                        self.calculation_process_sql(
7604                            operation=operation, operation_name=operation_name
7605                        )
7606                    else:
7607                        log.error(
7608                            f"Operations config: Type '{operation_type}' NOT available"
7609                        )
7610                        raise ValueError(
7611                            f"Operations config: Type '{operation_type}' NOT available"
7612                        )
7613                else:
7614                    log.error(
7615                        f"Operations config: Calculation '{operation_name}' NOT available"
7616                    )
7617                    raise ValueError(
7618                        f"Operations config: Calculation '{operation_name}' NOT available"
7619                    )
7620
7621        # Explode INFOS fields into table fields
7622        if self.get_explode_infos():
7623            self.explode_infos(
7624                prefix=self.get_explode_infos_prefix(),
7625                fields=self.get_explode_infos_fields(),
7626                force=True,
7627            )
7628
7629    def calculation_process_sql(
7630        self, operation: dict, operation_name: str = "unknown"
7631    ) -> None:
7632        """
7633        The `calculation_process_sql` function takes in a mathematical operation as a string and
7634        performs the operation, updating the specified table with the result.
7635
7636        :param operation: The `operation` parameter is a dictionary that contains information about the
7637        mathematical operation to be performed. It includes the following keys:
7638        :type operation: dict
7639        :param operation_name: The `operation_name` parameter is a string that represents the name of
7640        the mathematical operation being performed. It is used for logging and error handling purposes,
7641        defaults to unknown
7642        :type operation_name: str (optional)
7643        """
7644
7645        # table variants
7646        table_variants = self.get_table_variants(clause="alter")
7647
7648        # Operation infos
7649        operation_name = operation.get("name", "unknown")
7650        log.debug(f"process sql {operation_name}")
7651        output_column_name = operation.get("output_column_name", operation_name)
7652        output_column_type = operation.get("output_column_type", "String")
7653        prefix = operation.get("explode_infos_prefix", "")
7654        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
7655        output_column_description = operation.get(
7656            "output_column_description", f"{operation_name} operation"
7657        )
7658        operation_query = operation.get("operation_query", None)
7659        if isinstance(operation_query, list):
7660            operation_query = " ".join(operation_query)
7661        operation_info_fields = operation.get("info_fields", [])
7662        operation_info_fields_check = operation.get("info_fields_check", False)
7663        operation_info = operation.get("operation_info", True)
7664
7665        if operation_query:
7666
7667            # Info fields check
7668            operation_info_fields_check_result = True
7669            if operation_info_fields_check:
7670                header_infos = self.get_header().infos
7671                for info_field in operation_info_fields:
7672                    operation_info_fields_check_result = (
7673                        operation_info_fields_check_result
7674                        and info_field in header_infos
7675                    )
7676
7677            # If info fields available
7678            if operation_info_fields_check_result:
7679
7680                # Added_columns
7681                added_columns = []
7682
7683                # Create VCF header field
7684                vcf_reader = self.get_header()
7685                vcf_reader.infos[output_column_name] = vcf.parser._Info(
7686                    output_column_name,
7687                    ".",
7688                    output_column_type,
7689                    output_column_description,
7690                    "howard calculation",
7691                    "0",
7692                    self.code_type_map.get(output_column_type),
7693                )
7694
7695                # Explode infos if needed
7696                log.debug(f"calculation_process_sql prefix {prefix}")
7697                added_columns += self.explode_infos(
7698                    prefix=prefix,
7699                    fields=[output_column_name] + operation_info_fields,
7700                    force=True,
7701                )
7702
7703                # Create column
7704                added_column = self.add_column(
7705                    table_name=table_variants,
7706                    column_name=prefix + output_column_name,
7707                    column_type=output_column_type_sql,
7708                    default_value="null",
7709                )
7710                added_columns.append(added_column)
7711
7712                # Operation calculation
7713                try:
7714
7715                    # Query to update calculation column
7716                    sql_update = f"""
7717                        UPDATE {table_variants}
7718                        SET "{prefix}{output_column_name}" = ({operation_query})
7719                    """
7720                    self.conn.execute(sql_update)
7721
7722                    # Add to INFO
7723                    if operation_info:
7724                        sql_update_info = f"""
7725                            UPDATE {table_variants}
7726                            SET "INFO" =
7727                                concat(
7728                                    CASE
7729                                        WHEN "INFO" IS NOT NULL
7730                                        THEN concat("INFO", ';')
7731                                        ELSE ''
7732                                    END,
7733                                    '{output_column_name}=',
7734                                    "{prefix}{output_column_name}"
7735                                )
7736                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
7737                        """
7738                        self.conn.execute(sql_update_info)
7739
7740                except:
7741                    log.error(
7742                        f"Operations config: Calculation '{operation_name}' query failed"
7743                    )
7744                    raise ValueError(
7745                        f"Operations config: Calculation '{operation_name}' query failed"
7746                    )
7747
7748                # Remove added columns
7749                for added_column in added_columns:
7750                    log.debug(f"added_column: {added_column}")
7751                    self.drop_column(column=added_column)
7752
7753            else:
7754                log.error(
7755                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
7756                )
7757                raise ValueError(
7758                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
7759                )
7760
7761        else:
7762            log.error(
7763                f"Operations config: Calculation '{operation_name}' query NOT defined"
7764            )
7765            raise ValueError(
7766                f"Operations config: Calculation '{operation_name}' query NOT defined"
7767            )
7768
7769    def calculation_process_function(
7770        self, operation: dict, operation_name: str = "unknown"
7771    ) -> None:
7772        """
7773        The `calculation_process_function` takes in an operation dictionary and performs the specified
7774        function with the given parameters.
7775
7776        :param operation: The `operation` parameter is a dictionary that contains information about the
7777        operation to be performed. It has the following keys:
7778        :type operation: dict
7779        :param operation_name: The `operation_name` parameter is a string that represents the name of
7780        the operation being performed. It is used for logging purposes, defaults to unknown
7781        :type operation_name: str (optional)
7782        """
7783
7784        operation_name = operation["name"]
7785        log.debug(f"process sql {operation_name}")
7786        function_name = operation["function_name"]
7787        function_params = operation["function_params"]
7788        getattr(self, function_name)(*function_params)
7789
7790    def calculation_variant_id(self) -> None:
7791        """
7792        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
7793        updates the INFO field of a variants table with the variant ID.
7794        """
7795
7796        # variant_id annotation field
7797        variant_id_tag = self.get_variant_id_column()
7798        added_columns = [variant_id_tag]
7799
7800        # variant_id hgvs tags"
7801        vcf_infos_tags = {
7802            variant_id_tag: "howard variant ID annotation",
7803        }
7804
7805        # Variants table
7806        table_variants = self.get_table_variants()
7807
7808        # Header
7809        vcf_reader = self.get_header()
7810
7811        # Add variant_id to header
7812        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
7813            variant_id_tag,
7814            ".",
7815            "String",
7816            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
7817            "howard calculation",
7818            "0",
7819            self.code_type_map.get("String"),
7820        )
7821
7822        # Update
7823        sql_update = f"""
7824            UPDATE {table_variants}
7825            SET "INFO" = 
7826                concat(
7827                    CASE
7828                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
7829                        THEN ''
7830                        ELSE concat("INFO", ';')
7831                    END,
7832                    '{variant_id_tag}=',
7833                    "{variant_id_tag}"
7834                )
7835        """
7836        self.conn.execute(sql_update)
7837
7838        # Remove added columns
7839        for added_column in added_columns:
7840            self.drop_column(column=added_column)
7841
    def calculation_extract_snpeff_hgvs(
        self,
        snpeff_hgvs: str = "snpeff_hgvs",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        Extract HGVS nomenclatures from the snpEff annotation field and
        append them to the INFO field of the variants table.

        The snpEff sub-field layout is recovered from the quoted part of the
        `snpeff_field` header description; each annotation value is then
        passed to `extract_snpeff_hgvs` to build the new `snpeff_hgvs` value.
        Helper columns created along the way are dropped before returning.
        If `snpeff_field` is not declared in the header, a warning is logged
        and nothing is changed.

        :param snpeff_hgvs: name of the INFO tag that will store the HGVS
        nomenclatures extracted from the snpEff annotations, defaults to
        snpeff_hgvs
        :type snpeff_hgvs: str (optional)
        :param snpeff_field: name of the INFO field containing the snpEff
        annotations, defaults to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the snpEff header description does not contain
        a quoted, pipe-separated list of sub-field names
        """

        # Description used when declaring the new INFO tag in the header
        vcf_infos_tags = {
            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
        }

        # Prefix of exploded INFO columns
        # NOTE(review): any non-empty configured prefix is replaced by
        # "INFO/" here — confirm this is intended rather than keeping the
        # configured value
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Exploded column names for the snpEff field and the new HGVS field
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Helper columns created below, dropped at the end
        added_columns = []

        # Explode the snpEff INFO field into its own column
        added_columns += self.explode_infos(fields=[snpeff_field])

        if snpeff_field in vcf_reader.infos:

            log.debug(vcf_reader.infos[snpeff_field])

            # Extract the snpEff sub-field names from the single-quoted part
            # of the header description (pipe-separated list)
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Key is the sub-field name stripped to alphanumerics;
                    # value keeps the original (possibly spaced) name
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Column uniquely identifying each variant (join key below)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Load variant ids and snpEff annotations into a DataFrame
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Compute the HGVS nomenclature for each variant from its snpEff
            # annotations, using the sub-field layout recovered above
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: extract_snpeff_hgvs(
                    str(x), header=list(ann_header_desc.values())
                )
            )

            # Declare the new INFO tag in the VCF header
            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
                snpeff_hgvs,
                ".",
                "String",
                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append '<snpeff_hgvs>=<value>' to INFO, joining on the variant
            # id. The query refers to the local DataFrame by its Python
            # variable name ("dataframe_snpeff_hgvs" — DuckDB replacement
            # scan), so that variable name must not be changed.
            # NOTE(review): the UPDATE targets the hard-coded table
            # 'variants' while the WHERE clause uses {table_variants} —
            # confirm both always name the same table
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE 
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                    '{snpeff_hgvs}=',
                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Release the DataFrame promptly (it can be large)
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove helper columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
7978
    def calculation_snpeff_ann_explode(
        self,
        uniquify: bool = True,
        output_format: str = "fields",
        output_prefix: str = "snpeff_",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        Explode snpEff annotations into separate INFO annotations.

        The snpEff sub-field layout is recovered from the quoted part of the
        `snpeff_field` header description; each variant's annotations are
        then passed to `explode_snpeff_ann` and the result is appended to the
        INFO field. Depending on `output_format`, either one JSON tag named
        `output_prefix` or one tag per snpEff sub-field (named
        `output_prefix` + sub-field) is declared in the header. If
        `snpeff_field` is not declared in the header, a warning is logged and
        nothing is changed.

        :param uniquify: whether `explode_snpeff_ann` should uniquify the
        exploded values, defaults to True
        :type uniquify: bool (optional)
        :param output_format: output format of the exploded annotations;
        "JSON" (case-insensitive) produces a single JSON tag, any other value
        produces one tag per sub-field, defaults to fields
        :type output_format: str (optional)
        :param output_prefix: prefix added to the generated annotation tags
        (or the full tag name in JSON mode), defaults to snpeff_
        :type output_prefix: str (optional)
        :param snpeff_field: name of the INFO field containing the snpEff
        annotations, defaults to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the snpEff header description does not contain
        a quoted, pipe-separated list of sub-field names
        """

        # Internal key used for tag descriptions and the temporary column
        snpeff_hgvs = "snpeff_ann_explode"

        # Description used when declaring the generated tags in the header
        vcf_infos_tags = {
            snpeff_hgvs: "Explode snpEff annotations",
        }

        # Prefix of exploded INFO columns
        # NOTE(review): any non-empty configured prefix is replaced by
        # "INFO/" here — confirm this is intended
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Exploded column names for the snpEff field and the computed result
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Helper columns created below, dropped at the end
        added_columns = []

        # Explode the snpEff INFO field into its own column
        added_columns += self.explode_infos(fields=[snpeff_field])
        log.debug(f"snpeff_field={snpeff_field}")
        log.debug(f"added_columns={added_columns}")

        if snpeff_field in vcf_reader.infos:

            # Extract the snpEff sub-field names from the single-quoted part
            # of the header description (pipe-separated list)
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Key is the sub-field name stripped to alphanumerics;
                    # value keeps the original (possibly spaced) name
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Column uniquely identifying each variant (join key below)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Load variant ids and snpEff annotations into a DataFrame
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Compute the exploded annotation string for each variant
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: explode_snpeff_ann(
                    str(x),
                    uniquify=uniquify,
                    output_format=output_format,
                    prefix=output_prefix,
                    header=list(ann_header_desc.values()),
                )
            )

            # Declare the generated tags in the VCF header. In JSON mode the
            # whole result goes into a single tag named after output_prefix;
            # otherwise one tag is declared per snpEff sub-field.
            ann_annotations_prefix = ""
            if output_format.upper() in ["JSON"]:
                ann_annotations_prefix = f"{output_prefix}="
                vcf_reader.infos[output_prefix] = vcf.parser._Info(
                    output_prefix,
                    ".",
                    "String",
                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                    + " - JSON format",
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
            else:
                for ann_annotation in ann_header:
                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
                        ann_annotation_id,
                        ".",
                        "String",
                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
                        "howard calculation",
                        "0",
                        self.code_type_map.get("String"),
                    )

            # Append the exploded annotations to INFO, joining on the variant
            # id. The query refers to the local DataFrame by its Python
            # variable name ("dataframe_snpeff_hgvs" — DuckDB replacement
            # scan), so that variable name must not be changed.
            # NOTE(review): the UPDATE targets the hard-coded table
            # 'variants' while the WHERE clause uses {table_variants} —
            # confirm both always name the same table
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE 
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{ann_annotations_prefix}',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Release the DataFrame promptly (it can be large)
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove helper columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
8153
    def calculation_extract_nomen(self) -> None:
        """
        Extract NOMEN hgvs nomenclatures from an HGVS annotation field.

        Reads the HGVS source field and an optional transcripts-of-reference
        file from `param["calculation"]["calculations"]["NOMEN"]["options"]`,
        computes the NOMEN structure per variant with `find_nomen`, declares
        one INFO tag per NOMEN component (NOMEN, CNOMEN, PNOMEN, ...) in the
        VCF header, and appends the non-empty components to the INFO field of
        the variants table. Helper columns are dropped before returning.

        :raises ValueError: if the configured transcripts file does not exist
        """

        # Temporary DataFrame column holding the raw NOMEN dict per variant
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN components: tag name -> header description
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Prefix of exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Name of the INFO field holding the HGVS annotations (option
        # "hgvs_field", default "hgvs")
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Optional transcripts-of-reference file (option "transcripts");
        # first column of the file is the transcript list passed to find_nomen
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        transcripts = []
        if transcripts_file:
            if os.path.exists(transcripts_file):
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts = transcripts_dataframe.iloc[:, 0].tolist()
            else:
                log.error(f"Transcript file '{transcripts_file}' does NOT exist")
                raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")

        # Helper columns created below, dropped at the end
        added_columns = []

        # Explode the HGVS INFO field into its own column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # Exploded column name of the HGVS field
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Load variant keys and HGVS annotations into a DataFrame
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """
            )

            # Compute the NOMEN structure (dict of components) per variant
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
                lambda x: find_nomen(str(x), transcripts=transcripts)
            )

            # Explode each NOMEN component into its own DataFrame column and
            # build one SQL CASE fragment per component for the INFO update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Per-component column (apply runs immediately, so the
                # nomen_field captured by the lambda is the current one)
                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                    lambda x: dict(x).get(nomen_field, "")
                )

                # Declare the component tag in the VCF header
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
                # Each fragment emits ';<tag>=<value>' (leading ';' included)
                # or '' when the component is empty
                sql_nomen_fields.append(
                    f"""
                        CASE 
                            WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                            THEN concat(
                                    ';{nomen_field}=',
                                    dataframe_hgvs."{nomen_field}"
                                )
                            ELSE ''
                        END
                    """
                )

            # Fragments become successive arguments of one concat() call
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Append the components to INFO, joining on the variant key. The
            # query refers to the local DataFrame by its Python variable name
            # ("dataframe_hgvs" — DuckDB replacement scan), so that variable
            # name must not be changed.
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS" 
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Release the DataFrame promptly (it can be large)
            del dataframe_hgvs
            gc.collect()

        # Remove helper columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
8296
    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
        """
        Annotate each variant with the number of pipelines/samples in which
        it was found.

        For each row, the FORMAT column and all sample columns are passed to
        `findbypipeline`, and the result is appended to INFO under `tag`.
        Does nothing when the file has no FORMAT column or no samples.

        :param tag: name of the INFO tag that will store the findbypipeline
        result; also used for the header declaration, defaults to
        findbypipeline
        :type tag: str (optional)
        """

        # Only applicable when the file has a FORMAT column and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # findbypipeline annotation field
            findbypipeline_tag = tag

            # Description used when declaring the tag in the header
            vcf_infos_tags = {
                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
            }

            # Prefix of exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # DataFrame column holding the computed result
            findbypipeline_infos = prefix + findbypipeline_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Column uniquely identifying each variant (join key below)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant id, FORMAT and every sample
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Load genotype data into a DataFrame
            dataframe_findbypipeline = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the findbypipeline value row by row
            dataframe_findbypipeline[findbypipeline_infos] = (
                dataframe_findbypipeline.apply(
                    lambda row: findbypipeline(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Declare the findbypipeline tag in the VCF header
            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
                findbypipeline_tag,
                ".",
                "String",
                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append '<tag>=<value>' to INFO, joining on the variant id. The
            # query refers to the local DataFrame by its Python variable name
            # ("dataframe_findbypipeline" — DuckDB replacement scan), so that
            # variable name must not be changed.
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE 
                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
                            THEN concat(
                                    '{findbypipeline_tag}=',
                                    dataframe_findbypipeline."{findbypipeline_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_findbypipeline
                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove helper columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the DataFrame promptly (it can be large)
            del dataframe_findbypipeline
            gc.collect()
8402
8403    def calculation_genotype_concordance(self) -> None:
8404        """
8405        The function `calculation_genotype_concordance` calculates the genotype concordance for
8406        multi-caller VCF files and updates the variant information in the database.
8407        """
8408
8409        # if FORMAT and samples
8410        if (
8411            "FORMAT" in self.get_header_columns_as_list()
8412            and self.get_header_sample_list()
8413        ):
8414
8415            # genotypeconcordance annotation field
8416            genotypeconcordance_tag = "genotypeconcordance"
8417
8418            # VCF infos tags
8419            vcf_infos_tags = {
8420                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
8421            }
8422
8423            # Prefix
8424            prefix = self.get_explode_infos_prefix()
8425
8426            # Field
8427            genotypeconcordance_infos = prefix + genotypeconcordance_tag
8428
8429            # Variants table
8430            table_variants = self.get_table_variants()
8431
8432            # Header
8433            vcf_reader = self.get_header()
8434
8435            # Create variant id
8436            variant_id_column = self.get_variant_id_column()
8437            added_columns = [variant_id_column]
8438
8439            # variant_id, FORMAT and samples
8440            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8441                self.get_header_sample_list()
8442            )
8443
8444            # Create dataframe
8445            dataframe_genotypeconcordance = self.get_query_to_df(
8446                f""" SELECT {samples_fields} FROM {table_variants} """
8447            )
8448
8449            # Create genotypeconcordance column
8450            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
8451                dataframe_genotypeconcordance.apply(
8452                    lambda row: genotypeconcordance(
8453                        row, samples=self.get_header_sample_list()
8454                    ),
8455                    axis=1,
8456                )
8457            )
8458
8459            # Add genotypeconcordance to header
8460            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
8461                genotypeconcordance_tag,
8462                ".",
8463                "String",
8464                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
8465                "howard calculation",
8466                "0",
8467                self.code_type_map.get("String"),
8468            )
8469
8470            # Update
8471            sql_update = f"""
8472                UPDATE variants
8473                SET "INFO" = 
8474                    concat(
8475                        CASE
8476                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8477                            THEN ''
8478                            ELSE concat("INFO", ';')
8479                        END,
8480                        CASE
8481                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
8482                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
8483                            THEN concat(
8484                                    '{genotypeconcordance_tag}=',
8485                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
8486                                )
8487                            ELSE ''
8488                        END
8489                    )
8490                FROM dataframe_genotypeconcordance
8491                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
8492            """
8493            self.conn.execute(sql_update)
8494
8495            # Remove added columns
8496            for added_column in added_columns:
8497                self.drop_column(column=added_column)
8498
8499            # Delete dataframe
8500            del dataframe_genotypeconcordance
8501            gc.collect()
8502
8503    def calculation_barcode(self, tag: str = "barcode") -> None:
8504        """
8505        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
8506        updates the INFO field in the file with the calculated barcode values.
8507        
8508        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
8509        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
8510        the default tag name is set to "barcode", defaults to barcode
8511        :type tag: str (optional)
8512        """
8513
8514        # if FORMAT and samples
8515        if (
8516            "FORMAT" in self.get_header_columns_as_list()
8517            and self.get_header_sample_list()
8518        ):
8519
8520            # barcode annotation field
8521            if not tag:
8522                tag = "barcode"
8523
8524            # VCF infos tags
8525            vcf_infos_tags = {
8526                tag: "barcode calculation (VaRank)",
8527            }
8528
8529            # Prefix
8530            prefix = self.get_explode_infos_prefix()
8531
8532            # Field
8533            barcode_infos = prefix + tag
8534
8535            # Variants table
8536            table_variants = self.get_table_variants()
8537
8538            # Header
8539            vcf_reader = self.get_header()
8540
8541            # Create variant id
8542            variant_id_column = self.get_variant_id_column()
8543            added_columns = [variant_id_column]
8544
8545            # variant_id, FORMAT and samples
8546            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8547                self.get_header_sample_list()
8548            )
8549
8550            # Create dataframe
8551            dataframe_barcode = self.get_query_to_df(
8552                f""" SELECT {samples_fields} FROM {table_variants} """
8553            )
8554
8555            # Create barcode column
8556            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
8557                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
8558            )
8559
8560            # Add barcode to header
8561            vcf_reader.infos[tag] = vcf.parser._Info(
8562                tag,
8563                ".",
8564                "String",
8565                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
8566                "howard calculation",
8567                "0",
8568                self.code_type_map.get("String"),
8569            )
8570
8571            # Update
8572            sql_update = f"""
8573                UPDATE {table_variants}
8574                SET "INFO" = 
8575                    concat(
8576                        CASE
8577                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8578                            THEN ''
8579                            ELSE concat("INFO", ';')
8580                        END,
8581                        CASE
8582                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
8583                            AND dataframe_barcode."{barcode_infos}" NOT NULL
8584                            THEN concat(
8585                                    '{tag}=',
8586                                    dataframe_barcode."{barcode_infos}"
8587                                )
8588                            ELSE ''
8589                        END
8590                    )
8591                FROM dataframe_barcode
8592                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
8593            """
8594            self.conn.execute(sql_update)
8595
8596            # Remove added columns
8597            for added_column in added_columns:
8598                self.drop_column(column=added_column)
8599
8600            # Delete dataframe
8601            del dataframe_barcode
8602            gc.collect()
8603
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        The `calculation_barcode_family` function calculates a family barcode for variants in a VCF
        file and writes it into the sample genotypes as two new FORMAT fields: `<tag>` (the barcode
        value) and `<tag>S` (the comma-separated list of family samples it was computed from).

        The family members come from the 'BARCODEFAMILY' calculation parameter 'family_pedigree',
        which may be a JSON file path, a JSON string, a comma-separated list of sample names, or a
        dict; when not provided, all samples of the VCF are used.

        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
        the barcode tag that will be added to the VCF file during the calculation process. If no value
        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
        :type tag: str (optional)
        :raises ValueError: if the pedigree is malformed or empty
        """

        # Only applicable if the VCF has genotypes (FORMAT column and samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # Fall back to the default tag name if an empty value was passed
            if not tag:
                tag = "BCF"

            # VCF infos tags: <tag> holds the barcode, <tag>S the samples used
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # PED param (pedigree describing the family members)
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED
            if ped:

                # Pedigree is a file
                # (note: `ped` is rebound from path to file handle to dict)
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string: try JSON first, otherwise treat it as a
                # comma-separated list of sample names mapped onto themselves
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is a dict (used as-is)
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct the list of family samples from the pedigree values
                ped_samples = list(ped.values())

            else:
                # No pedigree configured: use every sample of the VCF
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Name of the dataframe column holding the computed barcode
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (registered for cleanup afterwards)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and the family samples only
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Create dataframe with the genotype columns
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create barcode column (one value per variant, over the family samples)
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Add the two barcode family FORMAT fields to the header
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Build one SET clause per column: every sample column gets
            # ':<barcode>:<samples>' (family members) or ':.:.' (others)
            # appended, and FORMAT gets ':<tag>:<tag>S' appended
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # A './.' genotype is first expanded with one '.' per FORMAT field:
                # strip alphanumerics from FORMAT, then turn each ':' into ':.'
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                        "{sample}" = 
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            # Apply all SET clauses in a single UPDATE joined on the variant id
            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe and release memory
            del dataframe_barcode
            gc.collect()
8793
8794    def calculation_trio(self) -> None:
8795        """
8796        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
8797        information to the INFO field of each variant.
8798        """
8799
8800        # if FORMAT and samples
8801        if (
8802            "FORMAT" in self.get_header_columns_as_list()
8803            and self.get_header_sample_list()
8804        ):
8805
8806            # trio annotation field
8807            trio_tag = "trio"
8808
8809            # VCF infos tags
8810            vcf_infos_tags = {
8811                "trio": "trio calculation",
8812            }
8813
8814            # Param
8815            param = self.get_param()
8816
8817            # Prefix
8818            prefix = self.get_explode_infos_prefix()
8819
8820            # Trio param
8821            trio_ped = (
8822                param.get("calculation", {})
8823                .get("calculations", {})
8824                .get("TRIO", {})
8825                .get("trio_pedigree", None)
8826            )
8827
8828            # Load trio
8829            if trio_ped:
8830
8831                # Trio pedigree is a file
8832                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
8833                    log.debug("TRIO pedigree is file")
8834                    with open(full_path(trio_ped)) as trio_ped:
8835                        trio_ped = json.load(trio_ped)
8836
8837                # Trio pedigree is a string
8838                elif isinstance(trio_ped, str):
8839                    log.debug("TRIO pedigree is str")
8840                    try:
8841                        trio_ped = json.loads(trio_ped)
8842                        log.debug("TRIO pedigree is json str")
8843                    except ValueError as e:
8844                        trio_samples = trio_ped.split(",")
8845                        if len(trio_samples) == 3:
8846                            trio_ped = {
8847                                "father": trio_samples[0],
8848                                "mother": trio_samples[1],
8849                                "child": trio_samples[2],
8850                            }
8851                            log.debug("TRIO pedigree is list str")
8852                        else:
8853                            msg_error = "TRIO pedigree not well formatted"
8854                            log.error(msg_error)
8855                            raise ValueError(msg_error)
8856
8857                # Trio pedigree is a dict
8858                elif isinstance(trio_ped, dict):
8859                    log.debug("TRIO pedigree is dict")
8860
8861                # Trio pedigree is not well formatted
8862                else:
8863                    msg_error = "TRIO pedigree not well formatted"
8864                    log.error(msg_error)
8865                    raise ValueError(msg_error)
8866
8867                # Construct trio list
8868                trio_samples = [
8869                    trio_ped.get("father", ""),
8870                    trio_ped.get("mother", ""),
8871                    trio_ped.get("child", ""),
8872                ]
8873
8874            else:
8875                log.debug("TRIO pedigree not defined. Take the first 3 samples")
8876                samples_list = self.get_header_sample_list()
8877                if len(samples_list) >= 3:
8878                    trio_samples = self.get_header_sample_list()[0:3]
8879                    trio_ped = {
8880                        "father": trio_samples[0],
8881                        "mother": trio_samples[1],
8882                        "child": trio_samples[2],
8883                    }
8884                else:
8885                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
8886                    log.error(msg_error)
8887                    raise ValueError(msg_error)
8888
8889            # Check trio pedigree
8890            if not trio_ped or len(trio_ped) != 3:
8891                msg_error = f"Error in TRIO pedigree: {trio_ped}"
8892                log.error(msg_error)
8893                raise ValueError(msg_error)
8894
8895            # Log
8896            log.info(
8897                f"Calculation 'TRIO' - Samples: "
8898                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
8899            )
8900
8901            # Field
8902            trio_infos = prefix + trio_tag
8903
8904            # Variants table
8905            table_variants = self.get_table_variants()
8906
8907            # Header
8908            vcf_reader = self.get_header()
8909
8910            # Create variant id
8911            variant_id_column = self.get_variant_id_column()
8912            added_columns = [variant_id_column]
8913
8914            # variant_id, FORMAT and samples
8915            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8916                self.get_header_sample_list()
8917            )
8918
8919            # Create dataframe
8920            dataframe_trio = self.get_query_to_df(
8921                f""" SELECT {samples_fields} FROM {table_variants} """
8922            )
8923
8924            # Create trio column
8925            dataframe_trio[trio_infos] = dataframe_trio.apply(
8926                lambda row: trio(row, samples=trio_samples), axis=1
8927            )
8928
8929            # Add trio to header
8930            vcf_reader.infos[trio_tag] = vcf.parser._Info(
8931                trio_tag,
8932                ".",
8933                "String",
8934                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
8935                "howard calculation",
8936                "0",
8937                self.code_type_map.get("String"),
8938            )
8939
8940            # Update
8941            sql_update = f"""
8942                UPDATE {table_variants}
8943                SET "INFO" = 
8944                    concat(
8945                        CASE
8946                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8947                            THEN ''
8948                            ELSE concat("INFO", ';')
8949                        END,
8950                        CASE
8951                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
8952                             AND dataframe_trio."{trio_infos}" NOT NULL
8953                            THEN concat(
8954                                    '{trio_tag}=',
8955                                    dataframe_trio."{trio_infos}"
8956                                )
8957                            ELSE ''
8958                        END
8959                    )
8960                FROM dataframe_trio
8961                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
8962            """
8963            self.conn.execute(sql_update)
8964
8965            # Remove added columns
8966            for added_column in added_columns:
8967                self.drop_column(column=added_column)
8968
8969            # Delete dataframe
8970            del dataframe_trio
8971            gc.collect()
8972
8973    def calculation_vaf_normalization(self) -> None:
8974        """
8975        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
8976        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
8977        :return: The function does not return anything.
8978        """
8979
8980        # if FORMAT and samples
8981        if (
8982            "FORMAT" in self.get_header_columns_as_list()
8983            and self.get_header_sample_list()
8984        ):
8985
8986            # vaf_normalization annotation field
8987            vaf_normalization_tag = "VAF"
8988
8989            # VCF infos tags
8990            vcf_infos_tags = {
8991                "VAF": "VAF Variant Frequency",
8992            }
8993
8994            # Prefix
8995            prefix = self.get_explode_infos_prefix()
8996
8997            # Variants table
8998            table_variants = self.get_table_variants()
8999
9000            # Header
9001            vcf_reader = self.get_header()
9002
9003            # Do not calculate if VAF already exists
9004            if "VAF" in vcf_reader.formats:
9005                log.debug("VAF already on genotypes")
9006                return
9007
9008            # Create variant id
9009            variant_id_column = self.get_variant_id_column()
9010            added_columns = [variant_id_column]
9011
9012            # variant_id, FORMAT and samples
9013            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9014                self.get_header_sample_list()
9015            )
9016
9017            # Create dataframe
9018            dataframe_vaf_normalization = self.get_query_to_df(
9019                f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
9020            )
9021
9022            vaf_normalization_set = []
9023
9024            # for each sample vaf_normalization
9025            for sample in self.get_header_sample_list():
9026                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
9027                    lambda row: vaf_normalization(row, sample=sample), axis=1
9028                )
9029                vaf_normalization_set.append(
9030                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
9031                )
9032
9033            # Add VAF to FORMAT
9034            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
9035                "FORMAT"
9036            ].apply(lambda x: str(x) + ":VAF")
9037            vaf_normalization_set.append(
9038                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
9039            )
9040
9041            # Add vaf_normalization to header
9042            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
9043                id=vaf_normalization_tag,
9044                num="1",
9045                type="Float",
9046                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
9047                type_code=self.code_type_map.get("Float"),
9048            )
9049
9050            # Create fields to add in INFO
9051            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
9052
9053            # Update
9054            sql_update = f"""
9055                UPDATE {table_variants}
9056                SET {sql_vaf_normalization_set}
9057                FROM dataframe_vaf_normalization
9058                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
9059
9060            """
9061            self.conn.execute(sql_update)
9062
9063            # Remove added columns
9064            for added_column in added_columns:
9065                self.drop_column(column=added_column)
9066
9067            # Delete dataframe
9068            del dataframe_vaf_normalization
9069            gc.collect()
9070
9071    def calculation_genotype_stats(self, info: str = "VAF") -> None:
9072        """
9073        The `calculation_genotype_stats` function calculates genotype statistics for a given information
9074        field in a VCF file and updates the INFO column of the variants table with the calculated
9075        statistics.
9076
9077        :param info: The `info` parameter is a string that represents the type of information for which
9078        genotype statistics are calculated. It is used to generate various VCF info tags for the
9079        statistics, such as the number of occurrences, the list of values, the minimum value, the
9080        maximum value, the mean, the median, defaults to VAF
9081        :type info: str (optional)
9082        """
9083
9084        # if FORMAT and samples
9085        if (
9086            "FORMAT" in self.get_header_columns_as_list()
9087            and self.get_header_sample_list()
9088        ):
9089
9090            # vaf_stats annotation field
9091            vaf_stats_tag = info + "_stats"
9092
9093            # VCF infos tags
9094            vcf_infos_tags = {
9095                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
9096                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
9097                info + "_stats_min": f"genotype {info} Statistics - min {info}",
9098                info + "_stats_max": f"genotype {info} Statistics - max {info}",
9099                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
9100                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
9101                info
9102                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
9103            }
9104
9105            # Prefix
9106            prefix = self.get_explode_infos_prefix()
9107
9108            # Field
9109            vaf_stats_infos = prefix + vaf_stats_tag
9110
9111            # Variants table
9112            table_variants = self.get_table_variants()
9113
9114            # Header
9115            vcf_reader = self.get_header()
9116
9117            # Create variant id
9118            variant_id_column = self.get_variant_id_column()
9119            added_columns = [variant_id_column]
9120
9121            # variant_id, FORMAT and samples
9122            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9123                self.get_header_sample_list()
9124            )
9125
9126            # Create dataframe
9127            dataframe_vaf_stats = self.get_query_to_df(
9128                f""" SELECT {samples_fields} FROM {table_variants} """
9129            )
9130
9131            # Create vaf_stats column
9132            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
9133                lambda row: genotype_stats(
9134                    row, samples=self.get_header_sample_list(), info=info
9135                ),
9136                axis=1,
9137            )
9138
9139            # List of vcf tags
9140            sql_vaf_stats_fields = []
9141
9142            # Check all VAF stats infos
9143            for stat in vcf_infos_tags:
9144
9145                # Extract stats
9146                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
9147                    lambda x: dict(x).get(stat, "")
9148                )
9149
9150                # Add snpeff_hgvs to header
9151                vcf_reader.infos[stat] = vcf.parser._Info(
9152                    stat,
9153                    ".",
9154                    "String",
9155                    vcf_infos_tags.get(stat, "genotype statistics"),
9156                    "howard calculation",
9157                    "0",
9158                    self.code_type_map.get("String"),
9159                )
9160
9161                if len(sql_vaf_stats_fields):
9162                    sep = ";"
9163                else:
9164                    sep = ""
9165
9166                # Create fields to add in INFO
9167                sql_vaf_stats_fields.append(
9168                    f"""
9169                        CASE
9170                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
9171                            THEN concat(
9172                                    '{sep}{stat}=',
9173                                    dataframe_vaf_stats."{stat}"
9174                                )
9175                            ELSE ''
9176                        END
9177                    """
9178                )
9179
9180            # SQL set for update
9181            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)
9182
9183            # Update
9184            sql_update = f"""
9185                UPDATE variants
9186                SET "INFO" = 
9187                    concat(
9188                        CASE
9189                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
9190                            THEN ''
9191                            ELSE concat("INFO", ';')
9192                        END,
9193                        {sql_vaf_stats_fields_set}
9194                    )
9195                FROM dataframe_vaf_stats
9196                WHERE variants."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"
9197
9198            """
9199            self.conn.execute(sql_update)
9200
9201            # Remove added columns
9202            for added_column in added_columns:
9203                self.drop_column(column=added_column)
9204
9205            # Delete dataframe
9206            del dataframe_vaf_stats
9207            gc.collect()
class Variants:
  34class Variants:
  35
  36    def __init__(
  37        self,
  38        conn=None,
  39        input: str = None,
  40        output: str = None,
  41        config: dict = {},
  42        param: dict = {},
  43        load: bool = False,
  44    ) -> None:
  45        """
  46        The function `__init__` initializes the variables, sets the input, output, config, param, connexion and
  47        header
  48
  49        :param conn: the connection to the database
  50        :param input: the input file
  51        :param output: the output file
  52        :param config: a dictionary containing the configuration of the model
  53        :param param: a dictionary containing the parameters of the model
  54        """
  55
  56        # Init variables
  57        self.init_variables()
  58
  59        # Input
  60        self.set_input(input)
  61
  62        # Config
  63        self.set_config(config)
  64
  65        # Param
  66        self.set_param(param)
  67
  68        # Output
  69        self.set_output(output)
  70
  71        # connexion
  72        self.set_connexion(conn)
  73
  74        # Header
  75        self.set_header()
  76
  77        # Load data
  78        if load:
  79            self.load_data()
  80
  81    def set_input(self, input: str = None) -> None:
  82        """
  83        The function `set_input` takes a file name as input, extracts the name and extension, and sets
  84        attributes in the class accordingly.
  85        
  86        :param input: The `set_input` method in the provided code snippet is used to set attributes
  87        related to the input file. Here's a breakdown of the parameters and their usage in the method:
  88        :type input: str
  89        """
  90
  91        if input and not isinstance(input, str):
  92            try:
  93                self.input = input.name
  94            except:
  95                log.error(f"Input file '{input} in bad format")
  96                raise ValueError(f"Input file '{input} in bad format")
  97        else:
  98            self.input = input
  99
 100        # Input format
 101        if input:
 102            input_name, input_extension = os.path.splitext(self.input)
 103            self.input_name = input_name
 104            self.input_extension = input_extension
 105            self.input_format = self.input_extension.replace(".", "")
 106
 107    def set_config(self, config: dict) -> None:
 108        """
 109        The set_config function takes a config object and assigns it as the configuration object for the
 110        class.
 111        
 112        :param config: The `config` parameter in the `set_config` function is a dictionary object that
 113        contains configuration settings for the class. When you call the `set_config` function with a
 114        dictionary object as the argument, it will set that dictionary as the configuration object for
 115        the class
 116        :type config: dict
 117        """
 118
 119        self.config = config
 120
 121    def set_param(self, param: dict) -> None:
 122        """
 123        This function sets a parameter object for the class based on the input dictionary.
 124        
 125        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
 126        as the `param` attribute of the class instance
 127        :type param: dict
 128        """
 129
 130        self.param = param
 131
 132    def init_variables(self) -> None:
 133        """
 134        This function initializes the variables that will be used in the rest of the class
 135        """
 136
 137        self.prefix = "howard"
 138        self.table_variants = "variants"
 139        self.dataframe = None
 140
 141        self.comparison_map = {
 142            "gt": ">",
 143            "gte": ">=",
 144            "lt": "<",
 145            "lte": "<=",
 146            "equals": "=",
 147            "contains": "SIMILAR TO",
 148        }
 149
 150        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
 151
 152        self.code_type_map_to_sql = {
 153            "Integer": "INTEGER",
 154            "String": "VARCHAR",
 155            "Float": "FLOAT",
 156            "Flag": "VARCHAR",
 157        }
 158
 159        self.index_additionnal_fields = []
 160
 161    def get_indexing(self) -> bool:
 162        """
 163        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
 164        returns False.
 165        :return: The value of the indexing parameter.
 166        """
 167
 168        return self.get_param().get("indexing", False)
 169
 170    def get_connexion_config(self) -> dict:
 171        """
 172        The function `get_connexion_config` returns a dictionary containing the configuration for a
 173        connection, including the number of threads and memory limit.
 174        :return: a dictionary containing the configuration for the Connexion library.
 175        """
 176
 177        # config
 178        config = self.get_config()
 179
 180        # Connexion config
 181        connexion_config = {}
 182        threads = self.get_threads()
 183
 184        # Threads
 185        if threads:
 186            connexion_config["threads"] = threads
 187
 188        # Memory
 189        # if config.get("memory", None):
 190        #     connexion_config["memory_limit"] = config.get("memory")
 191        if self.get_memory():
 192            connexion_config["memory_limit"] = self.get_memory()
 193
 194        # Temporary directory
 195        if config.get("tmp", None):
 196            connexion_config["temp_directory"] = config.get("tmp")
 197
 198        # Access
 199        if config.get("access", None):
 200            access = config.get("access")
 201            if access in ["RO"]:
 202                access = "READ_ONLY"
 203            elif access in ["RW"]:
 204                access = "READ_WRITE"
 205            connexion_db = self.get_connexion_db()
 206            if connexion_db in ":memory:":
 207                access = "READ_WRITE"
 208            connexion_config["access_mode"] = access
 209
 210        return connexion_config
 211
 212    def get_duckdb_settings(self) -> dict:
 213        """
 214        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
 215        string.
 216        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
 217        """
 218
 219        # config
 220        config = self.get_config()
 221
 222        # duckdb settings
 223        duckdb_settings_dict = {}
 224        if config.get("duckdb_settings", None):
 225            duckdb_settings = config.get("duckdb_settings")
 226            duckdb_settings = full_path(duckdb_settings)
 227            # duckdb setting is a file
 228            if os.path.exists(duckdb_settings):
 229                with open(duckdb_settings) as json_file:
 230                    duckdb_settings_dict = yaml.safe_load(json_file)
 231            # duckdb settings is a string
 232            else:
 233                duckdb_settings_dict = json.loads(duckdb_settings)
 234
 235        return duckdb_settings_dict
 236
 237    def set_connexion_db(self) -> str:
 238        """
 239        The function `set_connexion_db` returns the appropriate database connection string based on the
 240        input format and connection type.
 241        :return: the value of the variable `connexion_db`.
 242        """
 243
 244        # Default connexion db
 245        default_connexion_db = ":memory:"
 246
 247        # Find connexion db
 248        if self.get_input_format() in ["db", "duckdb"]:
 249            connexion_db = self.get_input()
 250        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
 251            connexion_db = default_connexion_db
 252        elif self.get_connexion_type() in ["tmpfile"]:
 253            tmp_name = tempfile.mkdtemp(
 254                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
 255            )
 256            connexion_db = f"{tmp_name}/tmp.db"
 257        elif self.get_connexion_type() != "":
 258            connexion_db = self.get_connexion_type()
 259        else:
 260            connexion_db = default_connexion_db
 261
 262        # Set connexion db
 263        self.connexion_db = connexion_db
 264
 265        return connexion_db
 266
 267    def set_connexion(self, conn) -> None:
 268        """
 269        The function `set_connexion` creates a connection to a database, with options for different
 270        database formats and settings.
 271        
 272        :param conn: The `conn` parameter in the `set_connexion` method is the connection to the
 273        database. If a connection is not provided, a new connection to an in-memory database is created.
 274        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
 275        sqlite
 276        """
 277
 278        # Connexion db
 279        connexion_db = self.set_connexion_db()
 280
 281        # Connexion config
 282        connexion_config = self.get_connexion_config()
 283
 284        # Connexion format
 285        connexion_format = self.get_config().get("connexion_format", "duckdb")
 286        # Set connexion format
 287        self.connexion_format = connexion_format
 288
 289        # Connexion
 290        if not conn:
 291            if connexion_format in ["duckdb"]:
 292                conn = duckdb.connect(connexion_db, config=connexion_config)
 293                # duckDB settings
 294                duckdb_settings = self.get_duckdb_settings()
 295                if duckdb_settings:
 296                    for setting in duckdb_settings:
 297                        setting_value = duckdb_settings.get(setting)
 298                        if isinstance(setting_value, str):
 299                            setting_value = f"'{setting_value}'"
 300                        conn.execute(f"PRAGMA {setting}={setting_value};")
 301            elif connexion_format in ["sqlite"]:
 302                conn = sqlite3.connect(connexion_db)
 303
 304        # Set connexion
 305        self.conn = conn
 306
 307        # Log
 308        log.debug(f"connexion_format: {connexion_format}")
 309        log.debug(f"connexion_db: {connexion_db}")
 310        log.debug(f"connexion config: {connexion_config}")
 311        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
 312
 313    def set_output(self, output: str = None) -> None:
 314        """
 315        The `set_output` function in Python sets the output file based on the input or a specified key
 316        in the config file, extracting the output name, extension, and format.
 317        
 318        :param output: The `output` parameter in the `set_output` method is used to specify the name of
 319        the output file. If the config file has an 'output' key, the method sets the output to the value
 320        of that key. If no output is provided, it sets the output to `None`
 321        :type output: str
 322        """
 323
 324        if output and not isinstance(output, str):
 325            self.output = output.name
 326        else:
 327            self.output = output
 328
 329        # Output format
 330        if self.output:
 331            output_name, output_extension = os.path.splitext(self.output)
 332            self.output_name = output_name
 333            self.output_extension = output_extension
 334            self.output_format = self.output_extension.replace(".", "")
 335        else:
 336            self.output_name = None
 337            self.output_extension = None
 338            self.output_format = None
 339
    def set_header(self) -> None:
        """
        Read the header of the input file and store it both as a list of
        strings (`self.header_list`) and as a `vcf.Reader` object
        (`self.header_vcf`).

        The header is searched, in order: in the configured "header_file",
        inside the input file itself (vcf/hdr formats), in an external
        "<input>.hdr" file, or rebuilt from the file columns via `Database`;
        a minimal default VCF header is used as fallback. Without an input
        file, both attributes are set to None.
        """

        input_file = self.get_input()
        # Minimal valid VCF header used as fallback
        default_header_list = [
            "##fileformat=VCFv4.2",
            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
        ]

        # Full path
        input_file = full_path(input_file)

        if input_file:

            input_format = self.get_input_format()
            input_compressed = self.get_input_compressed()
            config = self.get_config()
            header_list = default_header_list
            # Only these formats can carry (or be given) a VCF header
            if input_format in [
                "vcf",
                "hdr",
                "tsv",
                "csv",
                "psv",
                "parquet",
                "db",
                "duckdb",
            ]:
                # header provided in param
                if config.get("header_file", None):
                    with open(config.get("header_file"), "rt") as f:
                        header_list = self.read_vcf_header(f)
                # within a vcf file format (header within input file itself)
                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                    # within a compressed vcf file format (.vcf.gz)
                    if input_compressed:
                        with bgzf.open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                    # within an uncompressed vcf file format (.vcf)
                    else:
                        with open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                # header provided in default external file .hdr
                elif os.path.exists((input_file + ".hdr")):
                    with open(input_file + ".hdr", "rt") as f:
                        header_list = self.read_vcf_header(f)
                else:
                    try:  # Try to get header info fields and file columns

                        with tempfile.TemporaryDirectory() as tmpdir:

                            # Create database
                            db_for_header = Database(database=input_file)

                            # Get header columns for infos fields
                            db_header_from_columns = (
                                db_for_header.get_header_from_columns()
                            )

                            # Get real columns in the file
                            db_header_columns = db_for_header.get_columns()

                            # Write header file
                            header_file_tmp = os.path.join(tmpdir, "header")
                            f = open(header_file_tmp, "w")
                            vcf.Writer(f, db_header_from_columns)
                            f.close()

                            # Replace #CHROM line with real columns
                            header_list = db_for_header.read_header_file(
                                header_file=header_file_tmp
                            )
                            header_list[-1] = "\t".join(db_header_columns)

                    except:
                        # NOTE(review): bare except — any failure (I/O, parsing,
                        # Database errors) falls back to the default header
                        log.warning(
                            f"No header for file {input_file}. Set as default VCF header"
                        )
                        header_list = default_header_list

            else:  # unknown/unsupported format

                log.error(f"Input file format '{input_format}' not available")
                raise ValueError(f"Input file format '{input_format}' not available")

            # Safety net: never leave an empty header
            if not header_list:
                header_list = default_header_list

            # header as list
            self.header_list = header_list

            # header as VCF object
            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

        else:

            self.header_list = None
            self.header_vcf = None
 441
 442    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
 443        """
 444        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
 445        DataFrame based on the connection format.
 446
 447        :param query: The `query` parameter in the `get_query_to_df` function is a string that
 448        represents the SQL query you want to execute. This query will be used to fetch data from a
 449        database and convert it into a pandas DataFrame
 450        :type query: str
 451        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
 452        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
 453        function will only fetch up to that number of rows from the database query result. If no limit
 454        is specified,
 455        :type limit: int
 456        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
 457        """
 458
 459        # Connexion format
 460        connexion_format = self.get_connexion_format()
 461
 462        # Limit in query
 463        if limit:
 464            pd.set_option("display.max_rows", limit)
 465            if connexion_format in ["duckdb"]:
 466                df = (
 467                    self.conn.execute(query)
 468                    .fetch_record_batch(limit)
 469                    .read_next_batch()
 470                    .to_pandas()
 471                )
 472            elif connexion_format in ["sqlite"]:
 473                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
 474
 475        # Full query
 476        else:
 477            if connexion_format in ["duckdb"]:
 478                df = self.conn.execute(query).df()
 479            elif connexion_format in ["sqlite"]:
 480                df = pd.read_sql_query(query, self.conn)
 481
 482        return df
 483
 484    def get_overview(self) -> None:
 485        """
 486        The function prints the input, output, config, and dataframe of the current object
 487        """
 488        table_variants_from = self.get_table_variants(clause="from")
 489        sql_columns = self.get_header_columns_as_sql()
 490        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
 491        df = self.get_query_to_df(sql_query_export)
 492        log.info(
 493            "Input:  "
 494            + str(self.get_input())
 495            + " ["
 496            + str(str(self.get_input_format()))
 497            + "]"
 498        )
 499        log.info(
 500            "Output: "
 501            + str(self.get_output())
 502            + " ["
 503            + str(str(self.get_output_format()))
 504            + "]"
 505        )
 506        log.info("Config: ")
 507        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
 508            "\n"
 509        ):
 510            log.info("\t" + str(d))
 511        log.info("Param: ")
 512        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
 513            "\n"
 514        ):
 515            log.info("\t" + str(d))
 516        log.info("Sample list: " + str(self.get_header_sample_list()))
 517        log.info("Dataframe: ")
 518        for d in str(df).split("\n"):
 519            log.info("\t" + str(d))
 520
 521        # garbage collector
 522        del df
 523        gc.collect()
 524
 525        return None
 526
    def get_stats(self) -> dict:
        """
        Calculate and return various statistics of the current object.

        The returned dictionary contains:
        - "Infos": input file, number of variants, samples, INFO/FORMAT fields
        - "Variants": counts by chromosome, by type and by substitution
        - "Samples": genotype counts per sample (when genotypes are present)
        - "Header": INFO and FORMAT field descriptions
        - "Quality": QUAL statistics (when a QUAL column is present)

        :return: a dictionary containing various statistics of the current object
        """

        # Log
        log.info(f"Stats Calculation...")

        # table variants
        table_variants_from = self.get_table_variants()

        # stats dict
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header INFO and FORMAT fields
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Variants by chr (sorted by chromosome name for stable reporting)
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Calculate percentage of variants per chromosome
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

        stats["Infos"]["Number of variants"] = int(nb_of_variants)

        ### Samples

        # Init
        samples = {}
        nb_of_samples = 0

        # Check Samples: only when genotypes are available (GT in FORMAT)
        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
            log.debug(f"Check samples...")
            for sample in self.get_header_sample_list():
                # Count genotypes per sample; rows are kept when the genotype
                # looks like e.g. 0/1 or 1|1 and the sample field count matches
                # the FORMAT field count
                sql_query_samples = f"""
                    SELECT  '{sample}' as sample,
                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                      )
                    GROUP BY genotype
                    """
                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
                sample_genotype_count = sql_query_genotype_df["count"].sum()
                if len(sql_query_genotype_df):
                    nb_of_samples += 1
                    samples[f"{sample} - {sample_genotype_count} variants"] = (
                        sql_query_genotype_df.to_dict(orient="index")
                    )

            stats["Samples"] = samples
            stats["Infos"]["Number of samples"] = nb_of_samples

        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        # `i` numbers the fields continuously across both INFO and FORMAT
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # ID
                header_infos_dict[i]["id"] = info

                # num: map special numeric codes to VCF "Number" symbols
                # (A/G/R/.) — presumably PyVCF's encoding; TODO confirm
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # type
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # desc
                if header_type_infos[info].desc != None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL (ignore missing values noted '.')
        if "QUAL" in self.get_header_columns():
            sql_query_qual = f"""
                    SELECT
                        avg(CAST(QUAL AS INTEGER)) AS Average,
                        min(CAST(QUAL AS INTEGER)) AS Minimum,
                        max(CAST(QUAL AS INTEGER)) AS Maximum,
                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                        median(CAST(QUAL AS INTEGER)) AS Median,
                        variance(CAST(QUAL AS INTEGER)) AS Variance
                    FROM {table_variants_from}
                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
                    """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV and InDel
        # NOTE(review): in the InDel branch, AND binds tighter than OR, so the
        # condition is len(REF)>1 OR (len(ALT)>1 AND len(REF)!=len(ALT)) — this
        # also counts MNVs (len(REF)>1 AND len(REF)==len(ALT)); confirm intended

        sql_query_snv = f"""
            
            SELECT Type, count FROM (

                    SELECT
                        'Total' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}

                    UNION

                    SELECT
                        'MNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 AND len(ALT) > 1
                    AND len(REF) = len(ALT)

                    UNION

                    SELECT
                        'InDel' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 OR len(ALT) > 1
                    AND len(REF) != len(ALT)
                    
                    UNION

                    SELECT
                        'SNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) = 1 AND len(ALT) = 1

                )

            ORDER BY count DESC

                """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        # Substitution spectrum for SNVs (e.g. A>G counts)
        sql_query_snv_substitution = f"""
                SELECT
                    concat(REF, '>', ALT) AS 'Substitution',
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1
                GROUP BY REF, ALT
                ORDER BY count(*) DESC
                """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats
 748
 749    def stats_to_file(self, file: str = None) -> str:
 750        """
 751        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
 752        into a JSON object, and writes the JSON object to the specified file.
 753
 754        :param file: The `file` parameter is a string that represents the file path where the JSON data
 755        will be written
 756        :type file: str
 757        :return: the name of the file that was written to.
 758        """
 759
 760        # Get stats
 761        stats = self.get_stats()
 762
 763        # Serializing json
 764        json_object = json.dumps(stats, indent=4)
 765
 766        # Writing to sample.json
 767        with open(file, "w") as outfile:
 768            outfile.write(json_object)
 769
 770        return file
 771
 772    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
 773        """
 774        The `print_stats` function generates a markdown file and prints the statistics contained in a
 775        JSON file in a formatted manner.
 776
 777        :param output_file: The `output_file` parameter is a string that specifies the path and filename
 778        of the output file where the stats will be printed in Markdown format. If no `output_file` is
 779        provided, a temporary directory will be created and the stats will be saved in a file named
 780        "stats.md" within that
 781        :type output_file: str
 782        :param json_file: The `json_file` parameter is a string that represents the path to the JSON
 783        file where the statistics will be saved. If no value is provided, a temporary directory will be
 784        created and a default file name "stats.json" will be used
 785        :type json_file: str
 786        :return: The function `print_stats` does not return any value. It has a return type annotation
 787        of `None`.
 788        """
 789
 790        # Full path
 791        output_file = full_path(output_file)
 792        json_file = full_path(json_file)
 793
 794        with tempfile.TemporaryDirectory() as tmpdir:
 795
 796            # Files
 797            if not output_file:
 798                output_file = os.path.join(tmpdir, "stats.md")
 799            if not json_file:
 800                json_file = os.path.join(tmpdir, "stats.json")
 801
 802            # Create folders
 803            if not os.path.exists(os.path.dirname(output_file)):
 804                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
 805            if not os.path.exists(os.path.dirname(json_file)):
 806                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)
 807
 808            # Create stats JSON file
 809            stats_file = self.stats_to_file(file=json_file)
 810
 811            # Print stats file
 812            with open(stats_file) as f:
 813                stats = yaml.safe_load(f)
 814
 815            # Output
 816            output_title = []
 817            output_index = []
 818            output = []
 819
 820            # Title
 821            output_title.append("# HOWARD Stats")
 822
 823            # Index
 824            output_index.append("## Index")
 825
 826            # Process sections
 827            for section in stats:
 828                infos = stats.get(section)
 829                section_link = "#" + section.lower().replace(" ", "-")
 830                output.append(f"## {section}")
 831                output_index.append(f"- [{section}]({section_link})")
 832
 833                if len(infos):
 834                    for info in infos:
 835                        try:
 836                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
 837                            is_df = True
 838                        except:
 839                            try:
 840                                df = pd.DataFrame.from_dict(
 841                                    json.loads((infos.get(info))), orient="index"
 842                                )
 843                                is_df = True
 844                            except:
 845                                is_df = False
 846                        if is_df:
 847                            output.append(f"### {info}")
 848                            info_link = "#" + info.lower().replace(" ", "-")
 849                            output_index.append(f"   - [{info}]({info_link})")
 850                            output.append(f"{df.to_markdown(index=False)}")
 851                        else:
 852                            output.append(f"- {info}: {infos.get(info)}")
 853                else:
 854                    output.append(f"NA")
 855
 856            # Write stats in markdown file
 857            with open(output_file, "w") as fp:
 858                for item in output_title:
 859                    fp.write("%s\n" % item)
 860                for item in output_index:
 861                    fp.write("%s\n" % item)
 862                for item in output:
 863                    fp.write("%s\n" % item)
 864
 865            # Output stats in markdown
 866            print("")
 867            print("\n\n".join(output_title))
 868            print("")
 869            print("\n\n".join(output))
 870            print("")
 871
 872        return None
 873
 874    def get_input(self) -> str:
 875        """
 876        It returns the value of the input variable.
 877        :return: The input is being returned.
 878        """
 879        return self.input
 880
 881    def get_input_format(self, input_file: str = None) -> str:
 882        """
 883        This function returns the format of the input variable, either from the provided input file or
 884        by prompting for input.
 885
 886        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
 887        represents the file path of the input file. If no `input_file` is provided when calling the
 888        method, it will default to `None`
 889        :type input_file: str
 890        :return: The format of the input variable is being returned.
 891        """
 892
 893        if not input_file:
 894            input_file = self.get_input()
 895        input_format = get_file_format(input_file)
 896        return input_format
 897
 898    def get_input_compressed(self, input_file: str = None) -> str:
 899        """
 900        The function `get_input_compressed` returns the format of the input variable after compressing
 901        it.
 902
 903        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
 904        that represents the file path of the input file. If no `input_file` is provided when calling the
 905        method, it will default to `None` and the method will then call `self.get_input()` to
 906        :type input_file: str
 907        :return: The function `get_input_compressed` returns the compressed format of the input
 908        variable.
 909        """
 910
 911        if not input_file:
 912            input_file = self.get_input()
 913        input_compressed = get_file_compressed(input_file)
 914        return input_compressed
 915
 916    def get_output(self) -> str:
 917        """
 918        It returns the output of the neuron.
 919        :return: The output of the neural network.
 920        """
 921
 922        return self.output
 923
 924    def get_output_format(self, output_file: str = None) -> str:
 925        """
 926        The function `get_output_format` returns the format of the input variable or the output file if
 927        provided.
 928
 929        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
 930        that represents the file path of the output file. If no `output_file` is provided when calling
 931        the method, it will default to the output obtained from the `get_output` method of the class
 932        instance. The
 933        :type output_file: str
 934        :return: The format of the input variable is being returned.
 935        """
 936
 937        if not output_file:
 938            output_file = self.get_output()
 939        output_format = get_file_format(output_file)
 940
 941        return output_format
 942
 943    def get_config(self) -> dict:
 944        """
 945        It returns the config
 946        :return: The config variable is being returned.
 947        """
 948        return self.config
 949
 950    def get_param(self) -> dict:
 951        """
 952        It returns the param
 953        :return: The param variable is being returned.
 954        """
 955        return self.param
 956
 957    def get_connexion_db(self) -> str:
 958        """
 959        It returns the connexion_db attribute of the object
 960        :return: The connexion_db is being returned.
 961        """
 962        return self.connexion_db
 963
 964    def get_prefix(self) -> str:
 965        """
 966        It returns the prefix of the object.
 967        :return: The prefix is being returned.
 968        """
 969        return self.prefix
 970
 971    def get_table_variants(self, clause: str = "select") -> str:
 972        """
 973        This function returns the table_variants attribute of the object
 974
 975        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
 976        defaults to select (optional)
 977        :return: The table_variants attribute of the object.
 978        """
 979
 980        # Access
 981        access = self.get_config().get("access", None)
 982
 983        # Clauses "select", "where", "update"
 984        if clause in ["select", "where", "update"]:
 985            table_variants = self.table_variants
 986        # Clause "from"
 987        elif clause in ["from"]:
 988            # For Read Only
 989            if self.get_input_format() in ["parquet"] and access in ["RO"]:
 990                input_file = self.get_input()
 991                table_variants = f"'{input_file}' as variants"
 992            # For Read Write
 993            else:
 994                table_variants = f"{self.table_variants} as variants"
 995        else:
 996            table_variants = self.table_variants
 997        return table_variants
 998
 999    def get_tmp_dir(self) -> str:
1000        """
1001        The function `get_tmp_dir` returns the temporary directory path based on configuration
1002        parameters or a default path.
1003        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
1004        configuration, parameters, and a default value of "/tmp".
1005        """
1006
1007        return get_tmp(
1008            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
1009        )
1010
1011    def get_connexion_type(self) -> str:
1012        """
1013        If the connexion type is not in the list of allowed connexion types, raise a ValueError
1014
1015        :return: The connexion type is being returned.
1016        """
1017        return self.get_config().get("connexion_type", "memory")
1018
1019    def get_connexion(self):
1020        """
1021        It returns the connection object
1022
1023        :return: The connection object.
1024        """
1025        return self.conn
1026
1027    def close_connexion(self) -> None:
1028        """
1029        This function closes the connection to the database.
1030        :return: The connection is being closed.
1031        """
1032        return self.conn.close()
1033
1034    def get_header(self, type: str = "vcf"):
1035        """
1036        This function returns the header of the VCF file as a list of strings
1037
1038        :param type: the type of header you want to get, defaults to vcf (optional)
1039        :return: The header of the vcf file.
1040        """
1041
1042        if self.header_vcf:
1043            if type == "vcf":
1044                return self.header_vcf
1045            elif type == "list":
1046                return self.header_list
1047        else:
1048            if type == "vcf":
1049                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
1050                return header
1051            elif type == "list":
1052                return vcf_required
1053
1054    def get_header_length(self, file: str = None) -> int:
1055        """
1056        The function `get_header_length` returns the length of the header list, excluding the #CHROM
1057        line.
1058
1059        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
1060        header file. If this argument is provided, the function will read the header from the specified
1061        file and return the length of the header list minus 1 (to exclude the #CHROM line)
1062        :type file: str
1063        :return: the length of the header list, excluding the #CHROM line.
1064        """
1065
1066        if file:
1067            return len(self.read_vcf_header_file(file=file)) - 1
1068        elif self.get_header(type="list"):
1069            return len(self.get_header(type="list")) - 1
1070        else:
1071            return 0
1072
1073    def get_header_columns(self) -> str:
1074        """
1075        This function returns the header list of a VCF
1076
1077        :return: The length of the header list.
1078        """
1079        if self.get_header():
1080            return self.get_header(type="list")[-1]
1081        else:
1082            return ""
1083
1084    def get_header_columns_as_list(self) -> list:
1085        """
1086        This function returns the header list of a VCF
1087
1088        :return: The length of the header list.
1089        """
1090        if self.get_header():
1091            return self.get_header_columns().strip().split("\t")
1092        else:
1093            return []
1094
1095    def get_header_columns_as_sql(self) -> str:
1096        """
1097        This function retruns header length (without #CHROM line)
1098
1099        :return: The length of the header list.
1100        """
1101        sql_column_list = []
1102        for col in self.get_header_columns_as_list():
1103            sql_column_list.append(f'"{col}"')
1104        return ",".join(sql_column_list)
1105
1106    def get_header_sample_list(self) -> list:
1107        """
1108        This function retruns header length (without #CHROM line)
1109
1110        :return: The length of the header list.
1111        """
1112        return self.header_vcf.samples
1113
1114    def get_verbose(self) -> bool:
1115        """
1116        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
1117        exist
1118
1119        :return: The value of the key "verbose" in the config dictionary.
1120        """
1121        return self.get_config().get("verbose", False)
1122
1123    def get_connexion_format(self) -> str:
1124        """
1125        It returns the connexion format of the object.
1126        :return: The connexion_format is being returned.
1127        """
1128        connexion_format = self.connexion_format
1129        if connexion_format not in ["duckdb", "sqlite"]:
1130            log.error(f"Unknown connexion format {connexion_format}")
1131            raise ValueError(f"Unknown connexion format {connexion_format}")
1132        else:
1133            return connexion_format
1134
1135    def insert_file_to_table(
1136        self,
1137        file,
1138        columns: str,
1139        header_len: int = 0,
1140        sep: str = "\t",
1141        chunksize: int = 1000000,
1142    ) -> None:
1143        """
1144        The function reads a file in chunks and inserts each chunk into a table based on the specified
1145        database format.
1146
1147        :param file: The `file` parameter is the file that you want to load into a table. It should be
1148        the path to the file on your system
1149        :param columns: The `columns` parameter in the `insert_file_to_table` function is a string that
1150        should contain the names of the columns in the table where the data will be inserted. The column
1151        names should be separated by commas within the string. For example, if you have columns named
1152        "id", "name
1153        :type columns: str
1154        :param header_len: The `header_len` parameter in the `insert_file_to_table` function specifies
1155        the number of lines to skip at the beginning of the file before reading the actual data. This
1156        parameter allows you to skip any header information present in the file before processing the
1157        data, defaults to 0
1158        :type header_len: int (optional)
1159        :param sep: The `sep` parameter in the `insert_file_to_table` function is used to specify the
1160        separator character that is used in the file being read. In this case, the default separator is
1161        set to `\t`, which represents a tab character. You can change this parameter to a different
1162        separator character if, defaults to \t
1163        :type sep: str (optional)
1164        :param chunksize: The `chunksize` parameter specifies the number of rows to read in at a time
1165        when processing the file in chunks. In the provided code snippet, the default value for
1166        `chunksize` is set to 1000000. This means that the file will be read in chunks of 1,, defaults
1167        to 1000000
1168        :type chunksize: int (optional)
1169        """
1170
1171        # Config
1172        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
1173        connexion_format = self.get_connexion_format()
1174
1175        log.debug("chunksize: " + str(chunksize))
1176
1177        if chunksize:
1178            for chunk in pd.read_csv(
1179                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
1180            ):
1181                if connexion_format in ["duckdb"]:
1182                    sql_insert_into = (
1183                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
1184                    )
1185                    self.conn.execute(sql_insert_into)
1186                elif connexion_format in ["sqlite"]:
1187                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
1188
    def load_data(
        self,
        input_file: str = None,
        drop_variants_table: bool = False,
        sample_size: int = 20480,
    ) -> None:
        """
        Read the input file (VCF/TSV/CSV/PSV or an existing database) and load
        it into the variants table of the current connexion.

        For a duckdb connexion the input is loaded through the `Database`
        helper (as a VIEW in read-only mode, a TABLE otherwise). For a sqlite
        connexion, a VCF-shaped table is created and the file is inserted in
        chunks via `insert_file_to_table`. Any other combination raises
        ValueError.

        :param input_file: The path to the input file. This is the VCF file that will be loaded into the
        table
        :type input_file: str
        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
        determines whether the variants table should be dropped before loading the data. If set to
        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
        not be dropped, defaults to False
        :type drop_variants_table: bool (optional)
        :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
        the input file; a falsy value is converted to -1 before being passed to
        `Database.get_sql_from`, defaults to 20480
        :type sample_size: int (optional)
        :raises ValueError: when the input format is not usable with the
        current connexion format.
        """

        log.info("Loading...")

        # change input file (also re-reads its header)
        if input_file:
            self.set_input(input_file)
            self.set_header()

        # drop variants table
        if drop_variants_table:
            self.drop_variants_table()

        # get table variants
        table_variants = self.get_table_variants()

        # Access
        access = self.get_config().get("access", None)
        log.debug(f"access: {access}")

        # Input format and compress
        input_format = self.get_input_format()
        input_compressed = self.get_input_compressed()
        log.debug(f"input_format: {input_format}")
        log.debug(f"input_compressed: {input_compressed}")

        # input_compressed_format
        if input_compressed:
            input_compressed_format = "gzip"
        else:
            input_compressed_format = "none"
        log.debug(f"input_compressed_format: {input_compressed_format}")

        # Connexion format
        connexion_format = self.get_connexion_format()

        # Sample size
        # NOTE(review): -1 presumably means "no sampling" downstream — verify
        # against Database.get_sql_from
        if not sample_size:
            sample_size = -1
        log.debug(f"sample_size: {sample_size}")

        # Load data
        log.debug(f"Load Data from {input_format}")

        # DuckDB connexion
        if connexion_format in ["duckdb"]:

            # Database already exists
            if self.input_format in ["db", "duckdb"]:

                # The else branch is unreachable here (connexion_format is
                # already known to be "duckdb" in this outer branch)
                if connexion_format in ["duckdb"]:
                    log.debug(f"Input file format '{self.input_format}' duckDB")
                else:
                    # NOTE(review): "compatilbe" typo in the runtime message
                    # text — left as-is here, fix separately
                    log.error(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )
                    raise ValueError(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )

            # Load from existing database format
            else:

                try:
                    # Create Table or View
                    database = Database(database=self.input)
                    sql_from = database.get_sql_from(sample_size=sample_size)

                    # Read-only access exposes the input as a VIEW; otherwise
                    # the data is materialized into a TABLE
                    if access in ["RO"]:
                        sql_load = (
                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    else:
                        sql_load = (
                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    self.conn.execute(sql_load)

                # NOTE(review): bare "except" hides the original error; prefer
                # "except Exception as e" and "raise ... from e"
                except:
                    # Format not available
                    log.error(f"Input file format '{self.input_format}' not available")
                    raise ValueError(
                        f"Input file format '{self.input_format}' not available"
                    )

        # SQLite connexion
        elif connexion_format in ["sqlite"] and input_format in [
            "vcf",
            "tsv",
            "csv",
            "psv",
        ]:

            # Main structure: the mandatory VCF columns and their SQL types
            structure = {
                "#CHROM": "VARCHAR",
                "POS": "INTEGER",
                "ID": "VARCHAR",
                "REF": "VARCHAR",
                "ALT": "VARCHAR",
                "QUAL": "VARCHAR",
                "FILTER": "VARCHAR",
                "INFO": "VARCHAR",
            }

            # Strcuture with samples
            # NOTE: structure_complete is the same dict object as structure
            # (an alias, not a copy) — both names mutate the same mapping
            structure_complete = structure
            if self.get_header_sample_list():
                structure["FORMAT"] = "VARCHAR"
                for sample in self.get_header_sample_list():
                    structure_complete[sample] = "VARCHAR"

            # Columns list for create and insert
            sql_create_table_columns = []
            sql_create_table_columns_list = []
            for column in structure_complete:
                column_type = structure_complete[column]
                sql_create_table_columns.append(
                    f'"{column}" {column_type} default NULL'
                )
                sql_create_table_columns_list.append(f'"{column}"')

            # Create database
            log.debug(f"Create Table {table_variants}")
            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
            self.conn.execute(sql_create_table)

            # chunksize define length of file chunk load file
            chunksize = 100000

            # delimiter
            delimiter = file_format_delimiters.get(input_format, "\t")

            # Load the input file
            with open(self.input, "rt") as input_file:

                # Use the appropriate file handler based on the input format
                # NOTE(review): the bgzf handle replaces input_file and is
                # never explicitly closed (the with-statement closes the
                # original plain handle) — confirm this is acceptable
                if input_compressed:
                    input_file = bgzf.open(self.input, "rt")
                # VCF files carry a meta header to skip; delimited files do not
                if input_format in ["vcf"]:
                    header_len = self.get_header_length()
                else:
                    header_len = 0

                # Insert the file contents into a table
                self.insert_file_to_table(
                    input_file,
                    columns=sql_create_table_columns_list_sql,
                    header_len=header_len,
                    sep=delimiter,
                    chunksize=chunksize,
                )

        else:
            log.error(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )
            raise ValueError(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        # Create index after insertion
        self.create_indexes()
1384
1385    def get_explode_infos(self) -> bool:
1386        """
1387        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
1388        to False if it is not set.
1389        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
1390        value. If the parameter is not present, it will return False.
1391        """
1392
1393        return self.get_param().get("explode", {}).get("explode_infos", False)
1394
1395    def get_explode_infos_fields(
1396        self,
1397        explode_infos_fields: str = None,
1398        remove_fields_not_in_header: bool = False,
1399    ) -> list:
1400        """
1401        The `get_explode_infos_fields` function returns a list of exploded information fields based on
1402        the input parameter `explode_infos_fields`.
1403
1404        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
1405        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
1406        comma-separated list of field names to explode
1407        :type explode_infos_fields: str
1408        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
1409        flag that determines whether to remove fields that are not present in the header. If it is set
1410        to `True`, any field that is not in the header will be excluded from the list of exploded
1411        information fields. If it is set to `, defaults to False
1412        :type remove_fields_not_in_header: bool (optional)
1413        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
1414        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
1415        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
1416        Otherwise, it returns a list of exploded information fields after removing any spaces and
1417        splitting the string by commas.
1418        """
1419
1420        # If no fields, get it in param
1421        if not explode_infos_fields:
1422            explode_infos_fields = (
1423                self.get_param().get("explode", {}).get("explode_infos_fields", None)
1424            )
1425
1426        # If no fields, defined as all fields in header using keyword
1427        if not explode_infos_fields:
1428            explode_infos_fields = "*"
1429
1430        # If fields list not empty
1431        if explode_infos_fields:
1432
1433            # Input fields list
1434            if isinstance(explode_infos_fields, str):
1435                fields_input = explode_infos_fields.split(",")
1436            elif isinstance(explode_infos_fields, list):
1437                fields_input = explode_infos_fields
1438            else:
1439                fields_input = []
1440
1441            # Fields list without * keyword
1442            fields_without_all = fields_input.copy()
1443            if "*".casefold() in (item.casefold() for item in fields_without_all):
1444                fields_without_all.remove("*")
1445
1446            # Fields in header
1447            fields_in_header = sorted(list(set(self.get_header().infos)))
1448
1449            # Construct list of fields
1450            fields_output = []
1451            for field in fields_input:
1452
1453                # Strip field
1454                field = field.strip()
1455
1456                # format keyword * in regex
1457                if field.upper() in ["*"]:
1458                    field = ".*"
1459
1460                # Find all fields with pattern
1461                r = re.compile(field)
1462                fields_search = sorted(list(filter(r.match, fields_in_header)))
1463
1464                # Remove fields input from search
1465                if fields_search != [field]:
1466                    fields_search = sorted(
1467                        list(set(fields_search).difference(fields_input))
1468                    )
1469
1470                # If field is not in header (avoid not well formatted header)
1471                if not fields_search and not remove_fields_not_in_header:
1472                    fields_search = [field]
1473
1474                # Add found fields
1475                for new_field in fields_search:
1476                    # Add field, if not already exists, and if it is in header (if asked)
1477                    if (
1478                        new_field not in fields_output
1479                        and (
1480                            not remove_fields_not_in_header
1481                            or new_field in fields_in_header
1482                        )
1483                        and new_field not in [".*"]
1484                    ):
1485                        fields_output.append(new_field)
1486
1487            return fields_output
1488
1489        else:
1490
1491            return []
1492
1493    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
1494        """
1495        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
1496        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
1497        not provided.
1498
1499        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
1500        prefix to be used for exploding or expanding information
1501        :type explode_infos_prefix: str
1502        :return: the value of the variable `explode_infos_prefix`.
1503        """
1504
1505        if not explode_infos_prefix:
1506            explode_infos_prefix = (
1507                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
1508            )
1509
1510        return explode_infos_prefix
1511
    def add_column(
        self,
        table_name,
        column_name,
        column_type,
        default_value=None,
        drop: bool = False,
    ) -> dict:
        """
        Add a column to a table of the current connexion, optionally dropping
        a pre-existing column of the same name first.

        :param table_name: the name of the table to add the column to
        :param column_name: the name of the column to add
        :param column_type: the SQL data type of the new column (e.g.
        "INTEGER", "VARCHAR")
        :param default_value: optional DEFAULT value for the new column; it is
        interpolated verbatim into the ALTER TABLE statement
        :param drop: when True and the column already exists, drop it before
        re-adding it; when False (default), an existing column is left
        untouched and None is returned, defaults to False
        :type drop: bool (optional)
        :return: a dict describing the added column (table_name, column_name,
        column_type, default_value), or None when the column already existed —
        including when it was dropped and re-created.
        """

        # added
        # NOTE(review): when an existing column is dropped and re-created,
        # `added` stays False and None is returned — confirm this is intended
        added = False
        dropped = False

        # Check if the column already exists in the table
        query = f""" SELECT * FROM {table_name} LIMIT 0 """
        columns = self.get_query_to_df(query).columns.tolist()
        if column_name in columns:
            log.debug(
                f"The {column_name} column already exists in the {table_name} table"
            )
            if drop:
                self.drop_column(table_name=table_name, column_name=column_name)
                dropped = True
            else:
                return None
        else:
            log.debug(f"The {column_name} column NOT exists in the {table_name} table")

        # Add column in table
        # NOTE(review): table/column names and default_value are interpolated
        # directly into SQL — assumes trusted, internally-generated inputs
        add_column_query = (
            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
        )
        if default_value is not None:
            add_column_query += f" DEFAULT {default_value}"
        self.execute_query(add_column_query)
        added = not dropped
        log.debug(
            f"The {column_name} column was successfully added to the {table_name} table"
        )

        if added:
            added_column = {
                "table_name": table_name,
                "column_name": column_name,
                "column_type": column_type,
                "default_value": default_value,
            }
        else:
            added_column = None

        return added_column
1583
1584    def drop_column(
1585        self, column: dict = None, table_name: str = None, column_name: str = None
1586    ) -> bool:
1587        """
1588        The `drop_column` function drops a specified column from a given table in a database and returns
1589        True if the column was successfully dropped, and False if the column does not exist in the
1590        table.
1591
1592        :param column: The `column` parameter is a dictionary that contains information about the column
1593        you want to drop. It has two keys:
1594        :type column: dict
1595        :param table_name: The `table_name` parameter is the name of the table from which you want to
1596        drop a column
1597        :type table_name: str
1598        :param column_name: The `column_name` parameter is the name of the column that you want to drop
1599        from the table
1600        :type column_name: str
1601        :return: a boolean value. It returns True if the column was successfully dropped from the table,
1602        and False if the column does not exist in the table.
1603        """
1604
1605        # Find column infos
1606        if column:
1607            if isinstance(column, dict):
1608                table_name = column.get("table_name", None)
1609                column_name = column.get("column_name", None)
1610            elif isinstance(column, str):
1611                table_name = self.get_table_variants()
1612                column_name = column
1613            else:
1614                table_name = None
1615                column_name = None
1616
1617        if not table_name and not column_name:
1618            return False
1619
1620        # Removed
1621        removed = False
1622
1623        # Check if the column already exists in the table
1624        query = f""" SELECT * FROM {table_name} LIMIT 0 """
1625        columns = self.get_query_to_df(query).columns.tolist()
1626        if column_name in columns:
1627            log.debug(f"The {column_name} column exists in the {table_name} table")
1628        else:
1629            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
1630            return False
1631
1632        # Add column in table # ALTER TABLE integers DROP k
1633        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
1634        self.execute_query(add_column_query)
1635        removed = True
1636        log.debug(
1637            f"The {column_name} column was successfully dropped to the {table_name} table"
1638        )
1639
1640        return removed
1641
1642    def explode_infos(
1643        self,
1644        prefix: str = None,
1645        create_index: bool = False,
1646        fields: list = None,
1647        force: bool = False,
1648        proccess_all_fields_together: bool = False,
1649    ) -> list:
1650        """
1651        The `explode_infos` function takes a VCF file and explodes the INFO fields into individual
1652        columns, returning a list of added columns.
1653
1654        :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO
1655        fields. If the `prefix` is not provided or is set to `None`, the function will use the value of
1656        `self.get_explode_infos_prefix()` as the prefix
1657        :type prefix: str
1658        :param create_index: The `create_index` parameter is a boolean flag that specifies whether to
1659        create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to
1660        `False`, indexes will not be created. The default value is `False`, defaults to False
1661        :type create_index: bool (optional)
1662        :param fields: The `fields` parameter is a list of INFO fields that you want to explode into
1663        individual columns. If this parameter is not provided, all INFO fields will be exploded
1664        :type fields: list
1665        :param force: The `force` parameter is a boolean flag that determines whether to drop and
1666        recreate the column if it already exists in the table. If `force` is set to `True`, the column
1667        will be dropped and recreated. If `force` is set to `False`, the column will not be dropped,
1668        defaults to False
1669        :type force: bool (optional)
1670        :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean
1671        flag that determines whether to process all the INFO fields together or individually. If set to
1672        `True`, all the INFO fields will be processed together. If set to `False`, each INFO field will
1673        be processed individually, defaults to False
1674        :type proccess_all_fields_together: bool (optional)
1675        :return: The function `explode_infos` returns a list of added columns.
1676        """
1677
1678        # drop indexes
1679        self.drop_indexes()
1680
1681        # connexion format
1682        connexion_format = self.get_connexion_format()
1683
1684        # Access
1685        access = self.get_config().get("access", None)
1686
1687        # Added columns
1688        added_columns = []
1689
1690        if access not in ["RO"]:
1691
1692            # prefix
1693            if prefix in [None, True] or not isinstance(prefix, str):
1694                if self.get_explode_infos_prefix() not in [None, True]:
1695                    prefix = self.get_explode_infos_prefix()
1696                else:
1697                    prefix = "INFO/"
1698
1699            # table variants
1700            table_variants = self.get_table_variants(clause="select")
1701
1702            # extra infos
1703            try:
1704                extra_infos = self.get_extra_infos()
1705            except:
1706                extra_infos = []
1707
1708            # Header infos
1709            header_infos = self.get_header().infos
1710
1711            log.debug(
1712                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
1713            )
1714
1715            sql_info_alter_table_array = []
1716
1717            # Info fields to check
1718            fields_list = list(header_infos)
1719            if fields:
1720                fields_list += fields
1721            fields_list = set(fields_list)
1722
1723            # If no fields
1724            if not fields:
1725                fields = []
1726
1727            # Translate fields if patterns
1728            fields = self.get_explode_infos_fields(explode_infos_fields=fields)
1729
1730            for info in fields:
1731
1732                info_id_sql = prefix + info
1733
1734                if (
1735                    info in fields_list
1736                    or prefix + info in fields_list
1737                    or info in extra_infos
1738                ):
1739
1740                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")
1741
1742                    if info in header_infos:
1743                        info_type = header_infos[info].type
1744                        info_num = header_infos[info].num
1745                    else:
1746                        info_type = "String"
1747                        info_num = 0
1748
1749                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
1750                    if info_num != 1:
1751                        type_sql = "VARCHAR"
1752
1753                    # Add field
1754                    added_column = self.add_column(
1755                        table_name=table_variants,
1756                        column_name=info_id_sql,
1757                        column_type=type_sql,
1758                        default_value="null",
1759                        drop=force,
1760                    )
1761
1762                    if added_column:
1763                        added_columns.append(added_column)
1764
1765                    if added_column or force:
1766
1767                        # add field to index
1768                        self.index_additionnal_fields.append(info_id_sql)
1769
1770                        # Update field array
1771                        if connexion_format in ["duckdb"]:
1772                            update_info_field = f"""
1773                            "{info_id_sql}" =
1774                                CASE
1775                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
1776                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
1777                                END
1778                            """
1779                        elif connexion_format in ["sqlite"]:
1780                            update_info_field = f"""
1781                                "{info_id_sql}" =
1782                                    CASE
1783                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
1784                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
1785                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
1786                                    END
1787                            """
1788
1789                        sql_info_alter_table_array.append(update_info_field)
1790
1791            if sql_info_alter_table_array:
1792
1793                # By chromosomes
1794                try:
1795                    chromosomes_list = list(
1796                        self.get_query_to_df(
1797                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
1798                        )["#CHROM"]
1799                    )
1800                except:
1801                    chromosomes_list = [None]
1802
1803                for chrom in chromosomes_list:
1804                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")
1805
1806                    # Where clause
1807                    where_clause = ""
1808                    if chrom and len(chromosomes_list) > 1:
1809                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """
1810
1811                    # Update table
1812                    if proccess_all_fields_together:
1813                        sql_info_alter_table_array_join = ", ".join(
1814                            sql_info_alter_table_array
1815                        )
1816                        if sql_info_alter_table_array_join:
1817                            sql_info_alter_table = f"""
1818                                UPDATE {table_variants}
1819                                SET {sql_info_alter_table_array_join}
1820                                {where_clause}
1821                                """
1822                            log.debug(
1823                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
1824                            )
1825                            # log.debug(sql_info_alter_table)
1826                            self.conn.execute(sql_info_alter_table)
1827                    else:
1828                        sql_info_alter_num = 0
1829                        for sql_info_alter in sql_info_alter_table_array:
1830                            sql_info_alter_num += 1
1831                            sql_info_alter_table = f"""
1832                                UPDATE {table_variants}
1833                                SET {sql_info_alter}
1834                                {where_clause}
1835                                """
1836                            log.debug(
1837                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
1838                            )
1839                            # log.debug(sql_info_alter_table)
1840                            self.conn.execute(sql_info_alter_table)
1841
1842        # create indexes
1843        if create_index:
1844            self.create_indexes()
1845
1846        return added_columns
1847
1848    def create_indexes(self) -> None:
1849        """
1850        Create indexes on the table after insertion
1851        """
1852
1853        # Access
1854        access = self.get_config().get("access", None)
1855
1856        # get table variants
1857        table_variants = self.get_table_variants("FROM")
1858
1859        if self.get_indexing() and access not in ["RO"]:
1860            # Create index
1861            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
1862            self.conn.execute(sql_create_table_index)
1863            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
1864            self.conn.execute(sql_create_table_index)
1865            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
1866            self.conn.execute(sql_create_table_index)
1867            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
1868            self.conn.execute(sql_create_table_index)
1869            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
1870            self.conn.execute(sql_create_table_index)
1871            for field in self.index_additionnal_fields:
1872                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
1873                self.conn.execute(sql_create_table_index)
1874
1875    def drop_indexes(self) -> None:
1876        """
1877        Create indexes on the table after insertion
1878        """
1879
1880        # Access
1881        access = self.get_config().get("access", None)
1882
1883        # get table variants
1884        table_variants = self.get_table_variants("FROM")
1885
1886        # Get database format
1887        connexion_format = self.get_connexion_format()
1888
1889        if access not in ["RO"]:
1890            if connexion_format in ["duckdb"]:
1891                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
1892            elif connexion_format in ["sqlite"]:
1893                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
1894
1895            list_indexes = self.conn.execute(sql_list_indexes)
1896            index_names = [row[0] for row in list_indexes.fetchall()]
1897            for index in index_names:
1898                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
1899                self.conn.execute(sql_drop_table_index)
1900
1901    def read_vcf_header(self, f) -> list:
1902        """
1903        It reads the header of a VCF file and returns a list of the header lines
1904
1905        :param f: the file object
1906        :return: The header lines of the VCF file.
1907        """
1908
1909        header_list = []
1910        for line in f:
1911            header_list.append(line)
1912            if line.startswith("#CHROM"):
1913                break
1914        return header_list
1915
1916    def read_vcf_header_file(self, file: str = None) -> list:
1917        """
1918        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
1919        uncompressed files.
1920
1921        :param file: The `file` parameter is a string that represents the path to the VCF header file
1922        that you want to read. It is an optional parameter, so if you don't provide a value, it will
1923        default to `None`
1924        :type file: str
1925        :return: The function `read_vcf_header_file` returns a list.
1926        """
1927
1928        if self.get_input_compressed(input_file=file):
1929            with bgzf.open(file, "rt") as f:
1930                return self.read_vcf_header(f=f)
1931        else:
1932            with open(file, "rt") as f:
1933                return self.read_vcf_header(f=f)
1934
1935    def execute_query(self, query: str):
1936        """
1937        It takes a query as an argument, executes it, and returns the results
1938
1939        :param query: The query to be executed
1940        :return: The result of the query is being returned.
1941        """
1942        if query:
1943            return self.conn.execute(query)  # .fetchall()
1944        else:
1945            return None
1946
1947    def export_output(
1948        self,
1949        output_file: str | None = None,
1950        output_header: str | None = None,
1951        export_header: bool = True,
1952        query: str | None = None,
1953        parquet_partitions: list | None = None,
1954        chunk_size: int | None = None,
1955        threads: int | None = None,
1956        sort: bool = False,
1957        index: bool = False,
1958        order_by: str | None = None,
1959    ) -> bool:
1960        """
1961        The `export_output` function exports data from a VCF file to a specified output file in various
1962        formats, including VCF, CSV, TSV, PSV, and Parquet.
1963
1964        :param output_file: The `output_file` parameter is a string that specifies the name of the
1965        output file to be generated by the function. This is where the exported data will be saved
1966        :type output_file: str
1967        :param output_header: The `output_header` parameter is a string that specifies the name of the
1968        file where the header of the VCF file will be exported. If this parameter is not provided, the
1969        header will be exported to a file with the same name as the `output_file` parameter, but with
1970        the extension "
1971        :type output_header: str
1972        :param export_header: The `export_header` parameter is a boolean flag that determines whether
1973        the header of a VCF file should be exported to a separate file or not. If `export_header` is
1974        True, the header will be exported to a file. If `export_header` is False, the header will not
1975        be, defaults to True, if output format is not VCF
1976        :type export_header: bool (optional)
1977        :param query: The `query` parameter is an optional SQL query that can be used to filter and
1978        select specific data from the VCF file before exporting it. If provided, only the data that
1979        matches the query will be exported
1980        :type query: str
1981        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
1982        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
1983        organize data in a hierarchical directory structure based on the values of one or more columns.
1984        This can improve query performance when working with large datasets
1985        :type parquet_partitions: list
1986        :param chunk_size: The `chunk_size` parameter specifies the number of
1987        records in batch when exporting data in Parquet format. This parameter is used for
1988        partitioning the Parquet file into multiple files.
1989        :type chunk_size: int
1990        :param threads: The `threads` parameter is an optional parameter that specifies the number of
1991        threads to be used during the export process. It determines the level of parallelism and can
1992        improve the performance of the export operation. If not provided, the function will use the
1993        default number of threads
1994        :type threads: int
1995        :param sort: The `sort` parameter is a boolean flag that determines whether the output file
1996        should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the
1997        genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to
1998        False
1999        :type sort: bool (optional)
2000        :param index: The `index` parameter is a boolean flag that determines whether an index should be
2001        created on the output file. If `index` is True, an index will be created. If `index` is False,
2002        no index will be created. The default value is False, defaults to False
2003        :type index: bool (optional)
2004        :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for
2005        sorting the output file. This parameter is only applicable when exporting data in VCF format
2006        :type order_by: str
2007        :return: a boolean value. It checks if the output file exists and returns True if it does, or
2008        None if it doesn't.
2009        """
2010
2011        # Log
2012        log.info("Exporting...")
2013
2014        # Full path
2015        output_file = full_path(output_file)
2016        output_header = full_path(output_header)
2017
2018        # Config
2019        config = self.get_config()
2020
2021        # Param
2022        param = self.get_param()
2023
2024        # Tmp files to remove
2025        tmp_to_remove = []
2026
2027        # If no output, get it
2028        if not output_file:
2029            output_file = self.get_output()
2030
2031        # If not threads
2032        if not threads:
2033            threads = self.get_threads()
2034
2035        # Auto header name with extension
2036        if export_header or output_header:
2037            if not output_header:
2038                output_header = f"{output_file}.hdr"
2039            # Export header
2040            self.export_header(output_file=output_file)
2041
2042        # Switch off export header if VCF output
2043        output_file_type = get_file_format(output_file)
2044        if output_file_type in ["vcf"]:
2045            export_header = False
2046            tmp_to_remove.append(output_header)
2047
2048        # Chunk size
2049        if not chunk_size:
2050            chunk_size = config.get("chunk_size", None)
2051
2052        # Parquet partition
2053        if not parquet_partitions:
2054            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
2055        if parquet_partitions and isinstance(parquet_partitions, str):
2056            parquet_partitions = parquet_partitions.split(",")
2057
2058        # Order by
2059        if not order_by:
2060            order_by = param.get("export", {}).get("order_by", "")
2061
2062        # Header in output
2063        header_in_output = param.get("export", {}).get("include_header", False)
2064
2065        # Database
2066        database_source = self.get_connexion()
2067
2068        # Connexion format
2069        connexion_format = self.get_connexion_format()
2070
2071        # Explode infos
2072        if self.get_explode_infos():
2073            self.explode_infos(
2074                prefix=self.get_explode_infos_prefix(),
2075                fields=self.get_explode_infos_fields(),
2076                force=False,
2077            )
2078
2079        # if connexion_format in ["sqlite"] or query:
2080        if connexion_format in ["sqlite"]:
2081
2082            # Export in Parquet
2083            random_tmp = "".join(
2084                random.choice(string.ascii_lowercase) for i in range(10)
2085            )
2086            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
2087            tmp_to_remove.append(database_source)
2088
2089            # Table Variants
2090            table_variants = self.get_table_variants()
2091
2092            # Create export query
2093            sql_query_export_subquery = f"""
2094                SELECT * FROM {table_variants}
2095                """
2096
2097            # Write source file
2098            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))
2099
2100        # Create database
2101        database = Database(
2102            database=database_source,
2103            table="variants",
2104            header_file=output_header,
2105            conn_config=self.get_connexion_config(),
2106        )
2107
2108        # Existing colomns header
2109        # existing_columns_header = database.get_header_file_columns(output_header)
2110        existing_columns_header = database.get_header_columns_from_database()
2111
2112        # Export file
2113        database.export(
2114            output_database=output_file,
2115            output_header=output_header,
2116            existing_columns_header=existing_columns_header,
2117            parquet_partitions=parquet_partitions,
2118            chunk_size=chunk_size,
2119            threads=threads,
2120            sort=sort,
2121            index=index,
2122            header_in_output=header_in_output,
2123            order_by=order_by,
2124            query=query,
2125            export_header=export_header,
2126        )
2127
2128        # Remove
2129        remove_if_exists(tmp_to_remove)
2130
2131        return (os.path.exists(output_file) or None) and (
2132            os.path.exists(output_file) or None
2133        )
2134
2135    def get_extra_infos(self, table: str = None) -> list:
2136        """
2137        The `get_extra_infos` function returns a list of columns that are in a specified table but not
2138        in the header.
2139
2140        :param table: The `table` parameter in the `get_extra_infos` function is used to specify the
2141        name of the table from which you want to retrieve the extra columns that are not present in the
2142        header. If the `table` parameter is not provided when calling the function, it will default to
2143        using the variants
2144        :type table: str
2145        :return: A list of columns that are in the specified table but not in the header of the table.
2146        """
2147
2148        header_columns = []
2149
2150        if not table:
2151            table = self.get_table_variants(clause="from")
2152            header_columns = self.get_header_columns()
2153
2154        # Check all columns in the database
2155        query = f""" SELECT * FROM {table} LIMIT 1 """
2156        log.debug(f"query {query}")
2157        table_columns = self.get_query_to_df(query).columns.tolist()
2158        extra_columns = []
2159
2160        # Construct extra infos (not in header)
2161        for column in table_columns:
2162            if column not in header_columns:
2163                extra_columns.append(column)
2164
2165        return extra_columns
2166
2167    def get_extra_infos_sql(self, table: str = None) -> str:
2168        """
2169        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
2170        by double quotes
2171
2172        :param table: The name of the table to get the extra infos from. If None, the default table is
2173        used
2174        :type table: str
2175        :return: A string of the extra infos
2176        """
2177
2178        return ", ".join(
2179            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
2180        )
2181
2182    def export_header(
2183        self,
2184        header_name: str = None,
2185        output_file: str = None,
2186        output_file_ext: str = ".hdr",
2187        clean_header: bool = True,
2188        remove_chrom_line: bool = False,
2189    ) -> str:
2190        """
2191        The `export_header` function takes a VCF file, extracts the header, modifies it according to
2192        specified options, and writes it to a new file.
2193
2194        :param header_name: The `header_name` parameter is the name of the header file to be created. If
2195        this parameter is not specified, the header will be written to the output file
2196        :type header_name: str
2197        :param output_file: The `output_file` parameter in the `export_header` function is used to
2198        specify the name of the output file where the header will be written. If this parameter is not
2199        provided, the header will be written to a temporary file
2200        :type output_file: str
2201        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
2202        string that represents the extension of the output header file. By default, it is set to ".hdr"
2203        if not specified by the user. This extension will be appended to the `output_file` name to
2204        create the final, defaults to .hdr
2205        :type output_file_ext: str (optional)
2206        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
2207        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
2208        `True`, the function will clean the header by modifying certain lines based on a specific
2209        pattern. If `clean_header`, defaults to True
2210        :type clean_header: bool (optional)
2211        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
2212        boolean flag that determines whether the #CHROM line should be removed from the header before
2213        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
2214        defaults to False
2215        :type remove_chrom_line: bool (optional)
2216        :return: The function `export_header` returns the name of the temporary header file that is
2217        created.
2218        """
2219
2220        if not header_name and not output_file:
2221            output_file = self.get_output()
2222
2223        if self.get_header():
2224
2225            # Get header object
2226            header_obj = self.get_header()
2227
2228            # Create database
2229            db_for_header = Database(database=self.get_input())
2230
2231            # Get real columns in the file
2232            db_header_columns = db_for_header.get_columns()
2233
2234            with tempfile.TemporaryDirectory() as tmpdir:
2235
2236                # Write header file
2237                header_file_tmp = os.path.join(tmpdir, "header")
2238                f = open(header_file_tmp, "w")
2239                vcf.Writer(f, header_obj)
2240                f.close()
2241
2242                # Replace #CHROM line with rel columns
2243                header_list = db_for_header.read_header_file(
2244                    header_file=header_file_tmp
2245                )
2246                header_list[-1] = "\t".join(db_header_columns)
2247
2248                # Remove CHROM line
2249                if remove_chrom_line:
2250                    header_list.pop()
2251
2252                # Clean header
2253                if clean_header:
2254                    header_list_clean = []
2255                    for head in header_list:
2256                        # Clean head for malformed header
2257                        head_clean = head
2258                        head_clean = re.subn(
2259                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
2260                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
2261                            head_clean,
2262                            2,
2263                        )[0]
2264                        # Write header
2265                        header_list_clean.append(head_clean)
2266                    header_list = header_list_clean
2267
2268            tmp_header_name = output_file + output_file_ext
2269
2270            f = open(tmp_header_name, "w")
2271            for line in header_list:
2272                f.write(line)
2273            f.close()
2274
2275        return tmp_header_name
2276
2277    def export_variant_vcf(
2278        self,
2279        vcf_file,
2280        remove_info: bool = False,
2281        add_samples: bool = True,
2282        list_samples: list = [],
2283        where_clause: str = "",
2284        index: bool = False,
2285        threads: int | None = None,
2286    ) -> bool | None:
2287        """
2288        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
2289        remove INFO field, add samples, and control compression and indexing.
2290
2291        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
2292        written to. It is the output file that will contain the filtered VCF data based on the specified
2293        parameters
2294        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
2295        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
2296        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
2297        in, defaults to False
2298        :type remove_info: bool (optional)
2299        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
2300        the samples should be added to the VCF file or not. If set to True, the samples will be added.
2301        If set to False, the samples will be removed. The default value is True, defaults to True
2302        :type add_samples: bool (optional)
2303        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
2304        in the output VCF file. By default, all samples will be included. If you provide a list of
2305        samples, only those samples will be included in the output file
2306        :type list_samples: list
2307        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
2308        determines whether or not to create an index for the output VCF file. If `index` is set to
2309        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
2310        :type index: bool (optional)
2311        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
2312        number of threads to use for exporting the VCF file. It determines how many parallel threads
2313        will be used during the export process. More threads can potentially speed up the export process
2314        by utilizing multiple cores of the processor. If
2315        :type threads: int | None
2316        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
2317        method with various parameters including the output file, query, threads, sort flag, and index
2318        flag. The `export_output` method is responsible for exporting the VCF data based on the
2319        specified parameters and configurations provided in the `export_variant_vcf` function.
2320        """
2321
2322        # Config
2323        config = self.get_config()
2324
2325        # Extract VCF
2326        log.debug("Export VCF...")
2327
2328        # Table variants
2329        table_variants = self.get_table_variants()
2330
2331        # Threads
2332        if not threads:
2333            threads = self.get_threads()
2334
2335        # Info fields
2336        if remove_info:
2337            if not isinstance(remove_info, str):
2338                remove_info = "."
2339            info_field = f"""'{remove_info}' as INFO"""
2340        else:
2341            info_field = "INFO"
2342
2343        # Samples fields
2344        if add_samples:
2345            if not list_samples:
2346                list_samples = self.get_header_sample_list()
2347            if list_samples:
2348                samples_fields = " , FORMAT , " + " , ".join(list_samples)
2349            else:
2350                samples_fields = ""
2351            log.debug(f"samples_fields: {samples_fields}")
2352        else:
2353            samples_fields = ""
2354
2355        # Where clause
2356        if where_clause is None:
2357            where_clause = ""
2358
2359        # Variants
2360        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
2361        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
2362        log.debug(f"sql_query_select={sql_query_select}")
2363
2364        return self.export_output(
2365            output_file=vcf_file,
2366            output_header=None,
2367            export_header=True,
2368            query=sql_query_select,
2369            parquet_partitions=None,
2370            chunk_size=config.get("chunk_size", None),
2371            threads=threads,
2372            sort=True,
2373            index=index,
2374            order_by=None,
2375        )
2376
2377    def run_commands(self, commands: list = [], threads: int = 1) -> None:
2378        """
2379        It takes a list of commands and runs them in parallel using the number of threads specified
2380
2381        :param commands: A list of commands to run
2382        :param threads: The number of threads to use, defaults to 1 (optional)
2383        """
2384
2385        run_parallel_commands(commands, threads)
2386
2387    def get_threads(self, default: int = 1) -> int:
2388        """
2389        This function returns the number of threads to use for a job, with a default value of 1 if not
2390        specified.
2391
2392        :param default: The `default` parameter in the `get_threads` method is used to specify the
2393        default number of threads to use if no specific value is provided. If no value is provided for
2394        the `threads` parameter in the configuration or input parameters, the `default` value will be
2395        used, defaults to 1
2396        :type default: int (optional)
2397        :return: the number of threads to use for the current job.
2398        """
2399
2400        # Config
2401        config = self.get_config()
2402
2403        # Param
2404        param = self.get_param()
2405
2406        # Input threads
2407        input_thread = param.get("threads", config.get("threads", None))
2408
2409        # Check threads
2410        if not input_thread:
2411            threads = default
2412        elif int(input_thread) <= 0:
2413            threads = os.cpu_count()
2414        else:
2415            threads = int(input_thread)
2416        return threads
2417
2418    def get_memory(self, default: str = None) -> str:
2419        """
2420        This function retrieves the memory value from parameters or configuration with a default value
2421        if not found.
2422
2423        :param default: The `get_memory` function takes in a default value as a string parameter. This
2424        default value is used as a fallback in case the `memory` parameter is not provided in the
2425        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
2426        the function
2427        :type default: str
2428        :return: The `get_memory` function returns a string value representing the memory parameter. If
2429        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
2430        return the default value provided as an argument to the function.
2431        """
2432
2433        # Config
2434        config = self.get_config()
2435
2436        # Param
2437        param = self.get_param()
2438
2439        # Input threads
2440        input_memory = param.get("memory", config.get("memory", None))
2441
2442        # Check threads
2443        if input_memory:
2444            memory = input_memory
2445        else:
2446            memory = default
2447
2448        return memory
2449
2450    def update_from_vcf(self, vcf_file: str) -> None:
2451        """
2452        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
2453
2454        :param vcf_file: the path to the VCF file
2455        """
2456
2457        connexion_format = self.get_connexion_format()
2458
2459        if connexion_format in ["duckdb"]:
2460            self.update_from_vcf_duckdb(vcf_file)
2461        elif connexion_format in ["sqlite"]:
2462            self.update_from_vcf_sqlite(vcf_file)
2463
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table from a VCF file (DuckDB
        backend).

        The VCF body is loaded into a pandas DataFrame, which the UPDATE
        statement below queries by name through DuckDB's pandas replacement
        scan. For each variant matching on #CHROM/POS/REF/ALT, the VCF INFO
        content is appended to the existing INFO, separated by ';' when both
        sides are non-empty ('' and '.' are treated as empty).

        :param vcf_file: the path to the VCF file
        """

        # variants table
        table_variants = self.get_table_variants()

        # Load the VCF body: skip the meta-header lines so the '#CHROM' line
        # is parsed as the column header.
        # NOTE(review): assumes get_header_length counts only the '##' lines
        # — confirm the '#CHROM' line is not included in the count.
        skip = self.get_header_length(file=vcf_file)
        # vcf_df is referenced by name inside the SQL below via DuckDB's
        # pandas replacement scan — do not rename or remove it even though it
        # looks unused to a linter.
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # Append matching VCF INFO to the existing INFO. DuckDB's concat()
        # treats NULL as empty, so rows without a match keep their INFO.
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)
2519
2520    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
2521        """
2522        It creates a temporary table in the SQLite database, loads the VCF file into the temporary
2523        table, then updates the INFO column of the variants table with the INFO column of the temporary
2524        table
2525
2526        :param vcf_file: The path to the VCF file you want to update the database with
2527        """
2528
2529        # Create a temporary table for the VCF
2530        table_vcf = "tmp_vcf"
2531        sql_create = (
2532            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
2533        )
2534        self.conn.execute(sql_create)
2535
2536        # Loading VCF into temporaire table
2537        vcf_df = pd.read_csv(
2538            vcf_file, sep="\t", comment="#", header=None, low_memory=False
2539        )
2540        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
2541        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)
2542
2543        # Update table 'variants' with VCF data
2544        # warning: CONCAT as || operator
2545        sql_query_update = f"""
2546            UPDATE variants as table_variants
2547            SET INFO = CASE
2548                            WHEN INFO NOT IN ('', '.')
2549                            THEN INFO
2550                            ELSE ''
2551                        END ||
2552                        (
2553                        SELECT 
2554                            CASE 
2555                                WHEN table_variants.INFO NOT IN ('','.') 
2556                                    AND table_vcf.INFO NOT IN ('','.')  
2557                                THEN ';' 
2558                                ELSE '' 
2559                            END || 
2560                            CASE 
2561                                WHEN table_vcf.INFO NOT IN ('','.') 
2562                                THEN table_vcf.INFO 
2563                                ELSE '' 
2564                            END
2565                        FROM {table_vcf} as table_vcf
2566                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
2567                            AND table_vcf.\"POS\" = table_variants.\"POS\"
2568                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
2569                            AND table_vcf.\"REF\" = table_variants.\"REF\"
2570                        )
2571        """
2572        self.conn.execute(sql_query_update)
2573
2574        # Drop temporary table
2575        sql_drop = f"DROP TABLE {table_vcf}"
2576        self.conn.execute(sql_drop)
2577
2578    def drop_variants_table(self) -> None:
2579        """
2580        > This function drops the variants table
2581        """
2582
2583        table_variants = self.get_table_variants()
2584        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
2585        self.conn.execute(sql_table_variants)
2586
    def set_variant_id(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Add a `variant_id` column to the variants table and populate it with a
        hash of the assembly, `#CHROM`, `POS`, `REF` and `ALT` columns.

        INFO/SVTYPE is temporarily exploded into its own column before
        hashing, and any columns added for that purpose are dropped again at
        the end.

        :param variant_id_column: The name of the column to be created in the
            variants table, defaults to variant_id
        :type variant_id_column: str (optional)
        :param force: If True, the variant_id column will be (re)created even
            if it already exists
        :type force: bool
        :return: The name of the column that contains the variant_id
        """

        # Assembly: param takes precedence over config, then the default
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # INFO/Tag prefix
        prefix = self.get_explode_infos_prefix()

        # Explode INFO/SVTYPE into a column (returns the columns it added)
        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

        # variants table
        table_variants = self.get_table_variants()

        # Fall back to the default column name when an empty one is given
        if not variant_id_column:
            variant_id_column = "variant_id"

        # Create variant_id column
        # NOTE(review): the existence check tests the literal "variant_id"
        # even when a custom variant_id_column is requested — confirm intent.
        if "variant_id" not in self.get_extra_infos() or force:

            # Create column
            self.add_column(
                table_name=table_variants,
                column_name=variant_id_column,
                column_type="UBIGINT",
                default_value="0",
            )

            # Update column
            # NOTE(review): '"{prefix}SVTYPE"' is quoted as a SQL string
            # literal, so the hash input is the constant column *name*, not
            # the per-row SVTYPE value — the explode above suggests the
            # column value may have been intended; confirm before changing.
            self.conn.execute(
                f"""
                    UPDATE {table_variants}
                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
                """
            )

        # Remove the columns added by explode_infos above
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # return variant_id column name
        return variant_id_column
2645
2646    def get_variant_id_column(
2647        self, variant_id_column: str = "variant_id", force: bool = None
2648    ) -> str:
2649        """
2650        This function returns the variant_id column name
2651
2652        :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
2653        defaults to variant_id
2654        :type variant_id_column: str (optional)
2655        :param force: If True, will force the variant_id to be set to the value of variant_id_column. If
2656        False, will only set the variant_id if it is not already set. If None, will set the variant_id
2657        if it is not already set, or if it is set
2658        :type force: bool
2659        :return: The variant_id column name.
2660        """
2661
2662        return self.set_variant_id(variant_id_column=variant_id_column, force=force)
2663
2664    ###
2665    # Annotation
2666    ###
2667
2668    def scan_databases(
2669        self,
2670        database_formats: list = ["parquet"],
2671        database_releases: list = ["current"],
2672    ) -> dict:
2673        """
2674        The function `scan_databases` scans for available databases based on specified formats and
2675        releases.
2676
2677        :param database_formats: The `database_formats` parameter is a list that specifies the formats
2678        of the databases to be scanned. In this case, the accepted format is "parquet"
2679        :type database_formats: list ["parquet"]
2680        :param database_releases: The `database_releases` parameter is a list that specifies the
2681        releases of the databases to be scanned. In the provided function, the default value for
2682        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
2683        databases that are in the "current"
2684        :type database_releases: list
2685        :return: The function `scan_databases` returns a dictionary containing information about
2686        databases that match the specified formats and releases.
2687        """
2688
2689        # Config
2690        config = self.get_config()
2691
2692        # Param
2693        param = self.get_param()
2694
2695        # Param - Assembly
2696        assembly = param.get("assembly", config.get("assembly", None))
2697        if not assembly:
2698            assembly = DEFAULT_ASSEMBLY
2699            log.warning(f"Default assembly '{assembly}'")
2700
2701        # Scan for availabled databases
2702        log.info(
2703            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
2704        )
2705        databases_infos_dict = databases_infos(
2706            database_folder_releases=database_releases,
2707            database_formats=database_formats,
2708            assembly=assembly,
2709            config=config,
2710        )
2711        log.info(
2712            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
2713        )
2714
2715        return databases_infos_dict
2716
2717    def annotation(self) -> None:
2718        """
2719        It annotates the VCF file with the annotations specified in the config file.
2720        """
2721
2722        # Config
2723        config = self.get_config()
2724
2725        # Param
2726        param = self.get_param()
2727
2728        # Param - Assembly
2729        assembly = param.get("assembly", config.get("assembly", None))
2730        if not assembly:
2731            assembly = DEFAULT_ASSEMBLY
2732            log.warning(f"Default assembly '{assembly}'")
2733
2734        # annotations databases folders
2735        annotations_databases = set(
2736            config.get("folders", {})
2737            .get("databases", {})
2738            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
2739            + config.get("folders", {})
2740            .get("databases", {})
2741            .get("parquet", ["~/howard/databases/parquet/current"])
2742            + config.get("folders", {})
2743            .get("databases", {})
2744            .get("bcftools", ["~/howard/databases/bcftools/current"])
2745        )
2746
2747        # Get param annotations
2748        if param.get("annotations", None) and isinstance(
2749            param.get("annotations", None), str
2750        ):
2751            log.debug(param.get("annotations", None))
2752            param_annotation_list = param.get("annotations").split(",")
2753        else:
2754            param_annotation_list = []
2755
2756        # Each tools param
2757        if param.get("annotation_parquet", None) != None:
2758            log.debug(
2759                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
2760            )
2761            if isinstance(param.get("annotation_parquet", None), list):
2762                param_annotation_list.append(",".join(param.get("annotation_parquet")))
2763            else:
2764                param_annotation_list.append(param.get("annotation_parquet"))
2765        if param.get("annotation_snpsift", None) != None:
2766            if isinstance(param.get("annotation_snpsift", None), list):
2767                param_annotation_list.append(
2768                    "snpsift:"
2769                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
2770                )
2771            else:
2772                param_annotation_list.append(
2773                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
2774                )
2775        if param.get("annotation_snpeff", None) != None:
2776            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
2777        if param.get("annotation_bcftools", None) != None:
2778            if isinstance(param.get("annotation_bcftools", None), list):
2779                param_annotation_list.append(
2780                    "bcftools:"
2781                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
2782                )
2783            else:
2784                param_annotation_list.append(
2785                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
2786                )
2787        if param.get("annotation_annovar", None) != None:
2788            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
2789        if param.get("annotation_exomiser", None) != None:
2790            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
2791        if param.get("annotation_splice", None) != None:
2792            param_annotation_list.append("splice:" + param.get("annotation_splice"))
2793
2794        # Merge param annotations list
2795        param["annotations"] = ",".join(param_annotation_list)
2796
2797        # debug
2798        log.debug(f"param_annotations={param['annotations']}")
2799
2800        if param.get("annotations"):
2801
2802            # Log
2803            # log.info("Annotations - Check annotation parameters")
2804
2805            if not "annotation" in param:
2806                param["annotation"] = {}
2807
2808            # List of annotations parameters
2809            annotations_list_input = {}
2810            if isinstance(param.get("annotations", None), str):
2811                annotation_file_list = [
2812                    value for value in param.get("annotations", "").split(",")
2813                ]
2814                for annotation_file in annotation_file_list:
2815                    annotations_list_input[annotation_file] = {"INFO": None}
2816            else:
2817                annotations_list_input = param.get("annotations", {})
2818
2819            log.info(f"Quick Annotations:")
2820            for annotation_key in list(annotations_list_input.keys()):
2821                log.info(f"   {annotation_key}")
2822
2823            # List of annotations and associated fields
2824            annotations_list = {}
2825
2826            for annotation_file in annotations_list_input:
2827
2828                # Explode annotations if ALL
2829                if (
2830                    annotation_file.upper() == "ALL"
2831                    or annotation_file.upper().startswith("ALL:")
2832                ):
2833
2834                    # check ALL parameters (formats, releases)
2835                    annotation_file_split = annotation_file.split(":")
2836                    database_formats = "parquet"
2837                    database_releases = "current"
2838                    for annotation_file_option in annotation_file_split[1:]:
2839                        database_all_options_split = annotation_file_option.split("=")
2840                        if database_all_options_split[0] == "format":
2841                            database_formats = database_all_options_split[1].split("+")
2842                        if database_all_options_split[0] == "release":
2843                            database_releases = database_all_options_split[1].split("+")
2844
2845                    # Scan for availabled databases
2846                    databases_infos_dict = self.scan_databases(
2847                        database_formats=database_formats,
2848                        database_releases=database_releases,
2849                    )
2850
2851                    # Add found databases in annotation parameters
2852                    for database_infos in databases_infos_dict.keys():
2853                        annotations_list[database_infos] = {"INFO": None}
2854
2855                else:
2856                    annotations_list[annotation_file] = annotations_list_input[
2857                        annotation_file
2858                    ]
2859
2860            # Check each databases
2861            if len(annotations_list):
2862
2863                log.info(
2864                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
2865                )
2866
2867                for annotation_file in annotations_list:
2868
2869                    # Init
2870                    annotations = annotations_list.get(annotation_file, None)
2871
2872                    # Annotation snpEff
2873                    if annotation_file.startswith("snpeff"):
2874
2875                        log.debug(f"Quick Annotation snpEff")
2876
2877                        if "snpeff" not in param["annotation"]:
2878                            param["annotation"]["snpeff"] = {}
2879
2880                        if "options" not in param["annotation"]["snpeff"]:
2881                            param["annotation"]["snpeff"]["options"] = ""
2882
2883                        # snpEff options in annotations
2884                        param["annotation"]["snpeff"]["options"] = "".join(
2885                            annotation_file.split(":")[1:]
2886                        )
2887
2888                    # Annotation Annovar
2889                    elif annotation_file.startswith("annovar"):
2890
2891                        log.debug(f"Quick Annotation Annovar")
2892
2893                        if "annovar" not in param["annotation"]:
2894                            param["annotation"]["annovar"] = {}
2895
2896                        if "annotations" not in param["annotation"]["annovar"]:
2897                            param["annotation"]["annovar"]["annotations"] = {}
2898
2899                        # Options
2900                        annotation_file_split = annotation_file.split(":")
2901                        for annotation_file_annotation in annotation_file_split[1:]:
2902                            if annotation_file_annotation:
2903                                param["annotation"]["annovar"]["annotations"][
2904                                    annotation_file_annotation
2905                                ] = annotations
2906
2907                    # Annotation Exomiser
2908                    elif annotation_file.startswith("exomiser"):
2909
2910                        log.debug(f"Quick Annotation Exomiser")
2911
2912                        param["annotation"]["exomiser"] = params_string_to_dict(
2913                            annotation_file
2914                        )
2915
2916                    # Annotation Splice
2917                    elif annotation_file.startswith("splice"):
2918
2919                        log.debug(f"Quick Annotation Splice")
2920
2921                        param["annotation"]["splice"] = params_string_to_dict(
2922                            annotation_file
2923                        )
2924
2925                    # Annotation Parquet or BCFTOOLS
2926                    else:
2927
2928                        # Tools detection
2929                        if annotation_file.startswith("bcftools:"):
2930                            annotation_tool_initial = "bcftools"
2931                            annotation_file = ":".join(annotation_file.split(":")[1:])
2932                        elif annotation_file.startswith("snpsift:"):
2933                            annotation_tool_initial = "snpsift"
2934                            annotation_file = ":".join(annotation_file.split(":")[1:])
2935                        else:
2936                            annotation_tool_initial = None
2937
2938                        # list of files
2939                        annotation_file_list = annotation_file.replace("+", ":").split(
2940                            ":"
2941                        )
2942
2943                        for annotation_file in annotation_file_list:
2944
2945                            if annotation_file:
2946
2947                                # Annotation tool initial
2948                                annotation_tool = annotation_tool_initial
2949
2950                                # Find file
2951                                annotation_file_found = None
2952
2953                                # Expand user
2954                                annotation_file = full_path(annotation_file)
2955
2956                                if os.path.exists(annotation_file):
2957                                    annotation_file_found = annotation_file
2958
2959                                else:
2960                                    # Find within assembly folders
2961                                    for annotations_database in annotations_databases:
2962                                        found_files = find_all(
2963                                            annotation_file,
2964                                            os.path.join(
2965                                                annotations_database, assembly
2966                                            ),
2967                                        )
2968                                        if len(found_files) > 0:
2969                                            annotation_file_found = found_files[0]
2970                                            break
2971                                    if not annotation_file_found and not assembly:
2972                                        # Find within folders
2973                                        for (
2974                                            annotations_database
2975                                        ) in annotations_databases:
2976                                            found_files = find_all(
2977                                                annotation_file, annotations_database
2978                                            )
2979                                            if len(found_files) > 0:
2980                                                annotation_file_found = found_files[0]
2981                                                break
2982                                log.debug(
2983                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
2984                                )
2985
2986                                # Full path
2987                                annotation_file_found = full_path(annotation_file_found)
2988
2989                                if annotation_file_found:
2990
2991                                    database = Database(database=annotation_file_found)
2992                                    quick_annotation_format = database.get_format()
2993                                    quick_annotation_is_compressed = (
2994                                        database.is_compressed()
2995                                    )
2996                                    quick_annotation_is_indexed = os.path.exists(
2997                                        f"{annotation_file_found}.tbi"
2998                                    )
2999                                    bcftools_preference = False
3000
3001                                    # Check Annotation Tool
3002                                    if not annotation_tool:
3003                                        if (
3004                                            bcftools_preference
3005                                            and quick_annotation_format
3006                                            in ["vcf", "bed"]
3007                                            and quick_annotation_is_compressed
3008                                            and quick_annotation_is_indexed
3009                                        ):
3010                                            annotation_tool = "bcftools"
3011                                        elif quick_annotation_format in [
3012                                            "vcf",
3013                                            "bed",
3014                                            "tsv",
3015                                            "tsv",
3016                                            "csv",
3017                                            "json",
3018                                            "tbl",
3019                                            "parquet",
3020                                            "duckdb",
3021                                        ]:
3022                                            annotation_tool = "parquet"
3023                                        else:
3024                                            log.error(
3025                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
3026                                            )
3027                                            raise ValueError(
3028                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
3029                                            )
3030
3031                                    log.debug(
3032                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
3033                                    )
3034
3035                                    # Annotation Tool dispatch
3036                                    if annotation_tool:
3037                                        if annotation_tool not in param["annotation"]:
3038                                            param["annotation"][annotation_tool] = {}
3039                                        if (
3040                                            "annotations"
3041                                            not in param["annotation"][annotation_tool]
3042                                        ):
3043                                            param["annotation"][annotation_tool][
3044                                                "annotations"
3045                                            ] = {}
3046                                        param["annotation"][annotation_tool][
3047                                            "annotations"
3048                                        ][annotation_file_found] = annotations
3049
3050                                else:
3051                                    log.error(
3052                                        f"Quick Annotation File {annotation_file} does NOT exist"
3053                                    )
3054
3055                self.set_param(param)
3056
3057        if param.get("annotation", None):
3058            log.info("Annotations")
3059            if param.get("annotation", {}).get("parquet", None):
3060                log.info("Annotations 'parquet'...")
3061                self.annotation_parquet()
3062            if param.get("annotation", {}).get("bcftools", None):
3063                log.info("Annotations 'bcftools'...")
3064                self.annotation_bcftools()
3065            if param.get("annotation", {}).get("snpsift", None):
3066                log.info("Annotations 'snpsift'...")
3067                self.annotation_snpsift()
3068            if param.get("annotation", {}).get("annovar", None):
3069                log.info("Annotations 'annovar'...")
3070                self.annotation_annovar()
3071            if param.get("annotation", {}).get("snpeff", None):
3072                log.info("Annotations 'snpeff'...")
3073                self.annotation_snpeff()
3074            if param.get("annotation", {}).get("exomiser", None) is not None:
3075                log.info("Annotations 'exomiser'...")
3076                self.annotation_exomiser()
3077            if param.get("annotation", {}).get("splice", None) is not None:
3078                log.info("Annotations 'splice' ...")
3079                self.annotation_splice()
3080
3081        # Explode INFOS fields into table fields
3082        if self.get_explode_infos():
3083            self.explode_infos(
3084                prefix=self.get_explode_infos_prefix(),
3085                fields=self.get_explode_infos_fields(),
3086                force=True,
3087            )
3088
3089    def annotation_snpsift(self, threads: int = None) -> None:
3090        """
3091        This function annotate with bcftools
3092
3093        :param threads: Number of threads to use
3094        :return: the value of the variable "return_value".
3095        """
3096
3097        # DEBUG
3098        log.debug("Start annotation with bcftools databases")
3099
3100        # Threads
3101        if not threads:
3102            threads = self.get_threads()
3103        log.debug("Threads: " + str(threads))
3104
3105        # Config
3106        config = self.get_config()
3107        log.debug("Config: " + str(config))
3108
3109        # Config - snpSift
3110        snpsift_bin_command = get_bin_command(
3111            bin="SnpSift.jar",
3112            tool="snpsift",
3113            bin_type="jar",
3114            config=config,
3115            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
3116        )
3117        if not snpsift_bin_command:
3118            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
3119            log.error(msg_err)
3120            raise ValueError(msg_err)
3121
3122        # Config - bcftools
3123        bcftools_bin_command = get_bin_command(
3124            bin="bcftools",
3125            tool="bcftools",
3126            bin_type="bin",
3127            config=config,
3128            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
3129        )
3130        if not bcftools_bin_command:
3131            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
3132            log.error(msg_err)
3133            raise ValueError(msg_err)
3134
3135        # Config - BCFTools databases folders
3136        databases_folders = set(
3137            self.get_config()
3138            .get("folders", {})
3139            .get("databases", {})
3140            .get("annotations", ["."])
3141            + self.get_config()
3142            .get("folders", {})
3143            .get("databases", {})
3144            .get("bcftools", ["."])
3145        )
3146        log.debug("Databases annotations: " + str(databases_folders))
3147
3148        # Param
3149        annotations = (
3150            self.get_param()
3151            .get("annotation", {})
3152            .get("snpsift", {})
3153            .get("annotations", None)
3154        )
3155        log.debug("Annotations: " + str(annotations))
3156
3157        # Assembly
3158        assembly = self.get_param().get(
3159            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
3160        )
3161
3162        # Data
3163        table_variants = self.get_table_variants()
3164
3165        # Check if not empty
3166        log.debug("Check if not empty")
3167        sql_query_chromosomes = (
3168            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3169        )
3170        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
3171        if not sql_query_chromosomes_df["count"][0]:
3172            log.info(f"VCF empty")
3173            return
3174
3175        # VCF header
3176        vcf_reader = self.get_header()
3177        log.debug("Initial header: " + str(vcf_reader.infos))
3178
3179        # Existing annotations
3180        for vcf_annotation in self.get_header().infos:
3181
3182            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
3183            log.debug(
3184                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
3185            )
3186
3187        if annotations:
3188
3189            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
3190
3191                # Export VCF file
3192                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
3193
3194                # Init
3195                commands = {}
3196
3197                for annotation in annotations:
3198                    annotation_fields = annotations[annotation]
3199
3200                    # Annotation Name
3201                    annotation_name = os.path.basename(annotation)
3202
3203                    if not annotation_fields:
3204                        annotation_fields = {"INFO": None}
3205
3206                    log.debug(f"Annotation '{annotation_name}'")
3207                    log.debug(
3208                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
3209                    )
3210
3211                    # Create Database
3212                    database = Database(
3213                        database=annotation,
3214                        databases_folders=databases_folders,
3215                        assembly=assembly,
3216                    )
3217
3218                    # Find files
3219                    db_file = database.get_database()
3220                    db_file = full_path(db_file)
3221                    db_hdr_file = database.get_header_file()
3222                    db_hdr_file = full_path(db_hdr_file)
3223                    db_file_type = database.get_format()
3224                    db_tbi_file = f"{db_file}.tbi"
3225                    db_file_compressed = database.is_compressed()
3226
3227                    # Check if compressed
3228                    if not db_file_compressed:
3229                        log.error(
3230                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3231                        )
3232                        raise ValueError(
3233                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3234                        )
3235
3236                    # Check if indexed
3237                    if not os.path.exists(db_tbi_file):
3238                        log.error(
3239                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3240                        )
3241                        raise ValueError(
3242                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3243                        )
3244
3245                    # Check index - try to create if not exists
3246                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
3247                        log.error("Annotation failed: database not valid")
3248                        log.error(f"Annotation annotation file: {db_file}")
3249                        log.error(f"Annotation annotation header: {db_hdr_file}")
3250                        log.error(f"Annotation annotation index: {db_tbi_file}")
3251                        raise ValueError(
3252                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
3253                        )
3254                    else:
3255
3256                        log.debug(
3257                            f"Annotation '{annotation}' - file: "
3258                            + str(db_file)
3259                            + " and "
3260                            + str(db_hdr_file)
3261                        )
3262
3263                        # Load header as VCF object
3264                        db_hdr_vcf = Variants(input=db_hdr_file)
3265                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
3266                        log.debug(
3267                            "Annotation database header: "
3268                            + str(db_hdr_vcf_header_infos)
3269                        )
3270
3271                        # For all fields in database
3272                        annotation_fields_full = False
3273                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
3274                            annotation_fields = {
3275                                key: key for key in db_hdr_vcf_header_infos
3276                            }
3277                            log.debug(
3278                                "Annotation database header - All annotations added: "
3279                                + str(annotation_fields)
3280                            )
3281                            annotation_fields_full = True
3282
3283                        # # Create file for field rename
3284                        # log.debug("Create file for field rename")
3285                        # tmp_rename = NamedTemporaryFile(
3286                        #     prefix=self.get_prefix(),
3287                        #     dir=self.get_tmp_dir(),
3288                        #     suffix=".rename",
3289                        #     delete=False,
3290                        # )
3291                        # tmp_rename_name = tmp_rename.name
3292                        # tmp_files.append(tmp_rename_name)
3293
3294                        # Number of fields
3295                        nb_annotation_field = 0
3296                        annotation_list = []
3297                        annotation_infos_rename_list = []
3298
3299                        for annotation_field in annotation_fields:
3300
3301                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
3302                            annotation_fields_new_name = annotation_fields.get(
3303                                annotation_field, annotation_field
3304                            )
3305                            if not annotation_fields_new_name:
3306                                annotation_fields_new_name = annotation_field
3307
3308                            # Check if field is in DB and if field is not elready in input data
3309                            if (
3310                                annotation_field in db_hdr_vcf.get_header().infos
3311                                and annotation_fields_new_name
3312                                not in self.get_header().infos
3313                            ):
3314
3315                                log.info(
3316                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
3317                                )
3318
3319                                # BCFTools annotate param to rename fields
3320                                if annotation_field != annotation_fields_new_name:
3321                                    annotation_infos_rename_list.append(
3322                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
3323                                    )
3324
3325                                # Add INFO field to header
3326                                db_hdr_vcf_header_infos_number = (
3327                                    db_hdr_vcf_header_infos[annotation_field].num or "."
3328                                )
3329                                db_hdr_vcf_header_infos_type = (
3330                                    db_hdr_vcf_header_infos[annotation_field].type
3331                                    or "String"
3332                                )
3333                                db_hdr_vcf_header_infos_description = (
3334                                    db_hdr_vcf_header_infos[annotation_field].desc
3335                                    or f"{annotation_field} description"
3336                                )
3337                                db_hdr_vcf_header_infos_source = (
3338                                    db_hdr_vcf_header_infos[annotation_field].source
3339                                    or "unknown"
3340                                )
3341                                db_hdr_vcf_header_infos_version = (
3342                                    db_hdr_vcf_header_infos[annotation_field].version
3343                                    or "unknown"
3344                                )
3345
3346                                vcf_reader.infos[annotation_fields_new_name] = (
3347                                    vcf.parser._Info(
3348                                        annotation_fields_new_name,
3349                                        db_hdr_vcf_header_infos_number,
3350                                        db_hdr_vcf_header_infos_type,
3351                                        db_hdr_vcf_header_infos_description,
3352                                        db_hdr_vcf_header_infos_source,
3353                                        db_hdr_vcf_header_infos_version,
3354                                        self.code_type_map[
3355                                            db_hdr_vcf_header_infos_type
3356                                        ],
3357                                    )
3358                                )
3359
3360                                annotation_list.append(annotation_field)
3361
3362                                nb_annotation_field += 1
3363
3364                            else:
3365
3366                                if (
3367                                    annotation_field
3368                                    not in db_hdr_vcf.get_header().infos
3369                                ):
3370                                    log.warning(
3371                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
3372                                    )
3373                                if (
3374                                    annotation_fields_new_name
3375                                    in self.get_header().infos
3376                                ):
3377                                    log.warning(
3378                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
3379                                    )
3380
3381                        log.info(
3382                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
3383                        )
3384
3385                        annotation_infos = ",".join(annotation_list)
3386
3387                        if annotation_infos != "":
3388
3389                            # Annotated VCF (and error file)
3390                            tmp_annotation_vcf_name = os.path.join(
3391                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
3392                            )
3393                            tmp_annotation_vcf_name_err = (
3394                                tmp_annotation_vcf_name + ".err"
3395                            )
3396
3397                            # Add fields to annotate
3398                            if not annotation_fields_full:
3399                                annotation_infos_option = f"-info {annotation_infos}"
3400                            else:
3401                                annotation_infos_option = ""
3402
3403                            # Info fields rename
3404                            if annotation_infos_rename_list:
3405                                annotation_infos_rename = " -c " + ",".join(
3406                                    annotation_infos_rename_list
3407                                )
3408                            else:
3409                                annotation_infos_rename = ""
3410
3411                            # Annotate command
3412                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
3413
3414                            # Add command
3415                            commands[command_annotate] = tmp_annotation_vcf_name
3416
3417                if commands:
3418
3419                    # Export VCF file
3420                    self.export_variant_vcf(
3421                        vcf_file=tmp_vcf_name,
3422                        remove_info=True,
3423                        add_samples=False,
3424                        index=True,
3425                    )
3426                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
3427
3428                    # Num command
3429                    nb_command = 0
3430
3431                    # Annotate
3432                    for command_annotate in commands:
3433                        nb_command += 1
3434                        log.info(
3435                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
3436                        )
3437                        log.debug(f"command_annotate={command_annotate}")
3438                        run_parallel_commands([command_annotate], threads)
3439
3440                        # Debug
3441                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
3442
3443                        # Update variants
3444                        log.info(
3445                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
3446                        )
3447                        self.update_from_vcf(commands[command_annotate])
3448
3449    def annotation_bcftools(self, threads: int = None) -> None:
3450        """
3451        This function annotate with bcftools
3452
3453        :param threads: Number of threads to use
3454        :return: the value of the variable "return_value".
3455        """
3456
3457        # DEBUG
3458        log.debug("Start annotation with bcftools databases")
3459
3460        # Threads
3461        if not threads:
3462            threads = self.get_threads()
3463        log.debug("Threads: " + str(threads))
3464
3465        # Config
3466        config = self.get_config()
3467        log.debug("Config: " + str(config))
3468
3469        # DEBUG
3470        delete_tmp = True
3471        if self.get_config().get("verbosity", "warning") in ["debug"]:
3472            delete_tmp = False
3473            log.debug("Delete tmp files/folders: " + str(delete_tmp))
3474
3475        # Config - BCFTools bin command
3476        bcftools_bin_command = get_bin_command(
3477            bin="bcftools",
3478            tool="bcftools",
3479            bin_type="bin",
3480            config=config,
3481            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
3482        )
3483        if not bcftools_bin_command:
3484            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
3485            log.error(msg_err)
3486            raise ValueError(msg_err)
3487
3488        # Config - BCFTools databases folders
3489        databases_folders = set(
3490            self.get_config()
3491            .get("folders", {})
3492            .get("databases", {})
3493            .get("annotations", ["."])
3494            + self.get_config()
3495            .get("folders", {})
3496            .get("databases", {})
3497            .get("bcftools", ["."])
3498        )
3499        log.debug("Databases annotations: " + str(databases_folders))
3500
3501        # Param
3502        annotations = (
3503            self.get_param()
3504            .get("annotation", {})
3505            .get("bcftools", {})
3506            .get("annotations", None)
3507        )
3508        log.debug("Annotations: " + str(annotations))
3509
3510        # Assembly
3511        assembly = self.get_param().get(
3512            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
3513        )
3514
3515        # Data
3516        table_variants = self.get_table_variants()
3517
3518        # Check if not empty
3519        log.debug("Check if not empty")
3520        sql_query_chromosomes = (
3521            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3522        )
3523        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
3524        if not sql_query_chromosomes_df["count"][0]:
3525            log.info(f"VCF empty")
3526            return
3527
3528        # Export in VCF
3529        log.debug("Create initial file to annotate")
3530        tmp_vcf = NamedTemporaryFile(
3531            prefix=self.get_prefix(),
3532            dir=self.get_tmp_dir(),
3533            suffix=".vcf.gz",
3534            delete=False,
3535        )
3536        tmp_vcf_name = tmp_vcf.name
3537
3538        # VCF header
3539        vcf_reader = self.get_header()
3540        log.debug("Initial header: " + str(vcf_reader.infos))
3541
3542        # Existing annotations
3543        for vcf_annotation in self.get_header().infos:
3544
3545            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
3546            log.debug(
3547                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
3548            )
3549
3550        if annotations:
3551
3552            tmp_ann_vcf_list = []
3553            commands = []
3554            tmp_files = []
3555            err_files = []
3556
3557            for annotation in annotations:
3558                annotation_fields = annotations[annotation]
3559
3560                # Annotation Name
3561                annotation_name = os.path.basename(annotation)
3562
3563                if not annotation_fields:
3564                    annotation_fields = {"INFO": None}
3565
3566                log.debug(f"Annotation '{annotation_name}'")
3567                log.debug(
3568                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
3569                )
3570
3571                # Create Database
3572                database = Database(
3573                    database=annotation,
3574                    databases_folders=databases_folders,
3575                    assembly=assembly,
3576                )
3577
3578                # Find files
3579                db_file = database.get_database()
3580                db_file = full_path(db_file)
3581                db_hdr_file = database.get_header_file()
3582                db_hdr_file = full_path(db_hdr_file)
3583                db_file_type = database.get_format()
3584                db_tbi_file = f"{db_file}.tbi"
3585                db_file_compressed = database.is_compressed()
3586
3587                # Check if compressed
3588                if not db_file_compressed:
3589                    log.error(
3590                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
3591                    )
3592                    raise ValueError(
3593                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
3594                    )
3595
3596                # Check if indexed
3597                if not os.path.exists(db_tbi_file):
3598                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
3599                    raise ValueError(
3600                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
3601                    )
3602
3603                # Check index - try to create if not exists
3604                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
3605                    log.error("Annotation failed: database not valid")
3606                    log.error(f"Annotation annotation file: {db_file}")
3607                    log.error(f"Annotation annotation header: {db_hdr_file}")
3608                    log.error(f"Annotation annotation index: {db_tbi_file}")
3609                    raise ValueError(
3610                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
3611                    )
3612                else:
3613
3614                    log.debug(
3615                        f"Annotation '{annotation}' - file: "
3616                        + str(db_file)
3617                        + " and "
3618                        + str(db_hdr_file)
3619                    )
3620
3621                    # Load header as VCF object
3622                    db_hdr_vcf = Variants(input=db_hdr_file)
3623                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
3624                    log.debug(
3625                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
3626                    )
3627
3628                    # For all fields in database
3629                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
3630                        annotation_fields = {
3631                            key: key for key in db_hdr_vcf_header_infos
3632                        }
3633                        log.debug(
3634                            "Annotation database header - All annotations added: "
3635                            + str(annotation_fields)
3636                        )
3637
3638                    # Number of fields
3639                    nb_annotation_field = 0
3640                    annotation_list = []
3641
3642                    for annotation_field in annotation_fields:
3643
3644                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
3645                        annotation_fields_new_name = annotation_fields.get(
3646                            annotation_field, annotation_field
3647                        )
3648                        if not annotation_fields_new_name:
3649                            annotation_fields_new_name = annotation_field
3650
3651                        # Check if field is in DB and if field is not elready in input data
3652                        if (
3653                            annotation_field in db_hdr_vcf.get_header().infos
3654                            and annotation_fields_new_name
3655                            not in self.get_header().infos
3656                        ):
3657
3658                            log.info(
3659                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
3660                            )
3661
3662                            # Add INFO field to header
3663                            db_hdr_vcf_header_infos_number = (
3664                                db_hdr_vcf_header_infos[annotation_field].num or "."
3665                            )
3666                            db_hdr_vcf_header_infos_type = (
3667                                db_hdr_vcf_header_infos[annotation_field].type
3668                                or "String"
3669                            )
3670                            db_hdr_vcf_header_infos_description = (
3671                                db_hdr_vcf_header_infos[annotation_field].desc
3672                                or f"{annotation_field} description"
3673                            )
3674                            db_hdr_vcf_header_infos_source = (
3675                                db_hdr_vcf_header_infos[annotation_field].source
3676                                or "unknown"
3677                            )
3678                            db_hdr_vcf_header_infos_version = (
3679                                db_hdr_vcf_header_infos[annotation_field].version
3680                                or "unknown"
3681                            )
3682
3683                            vcf_reader.infos[annotation_fields_new_name] = (
3684                                vcf.parser._Info(
3685                                    annotation_fields_new_name,
3686                                    db_hdr_vcf_header_infos_number,
3687                                    db_hdr_vcf_header_infos_type,
3688                                    db_hdr_vcf_header_infos_description,
3689                                    db_hdr_vcf_header_infos_source,
3690                                    db_hdr_vcf_header_infos_version,
3691                                    self.code_type_map[db_hdr_vcf_header_infos_type],
3692                                )
3693                            )
3694
3695                            # annotation_list.append(annotation_field)
3696                            if annotation_field != annotation_fields_new_name:
3697                                annotation_list.append(
3698                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
3699                                )
3700                            else:
3701                                annotation_list.append(annotation_field)
3702
3703                            nb_annotation_field += 1
3704
3705                        else:
3706
3707                            if annotation_field not in db_hdr_vcf.get_header().infos:
3708                                log.warning(
3709                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
3710                                )
3711                            if annotation_fields_new_name in self.get_header().infos:
3712                                log.warning(
3713                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
3714                                )
3715
3716                    log.info(
3717                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
3718                    )
3719
3720                    annotation_infos = ",".join(annotation_list)
3721
3722                    if annotation_infos != "":
3723
3724                        # Protect header for bcftools (remove "#CHROM" and variants line)
3725                        log.debug("Protect Header file - remove #CHROM line if exists")
3726                        tmp_header_vcf = NamedTemporaryFile(
3727                            prefix=self.get_prefix(),
3728                            dir=self.get_tmp_dir(),
3729                            suffix=".hdr",
3730                            delete=False,
3731                        )
3732                        tmp_header_vcf_name = tmp_header_vcf.name
3733                        tmp_files.append(tmp_header_vcf_name)
3734                        # Command
3735                        if db_hdr_file.endswith(".gz"):
3736                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
3737                        else:
3738                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
3739                        # Run
3740                        run_parallel_commands([command_extract_header], 1)
3741
3742                        # Find chomosomes
3743                        log.debug("Find chromosomes ")
3744                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
3745                        sql_query_chromosomes_df = self.get_query_to_df(
3746                            sql_query_chromosomes
3747                        )
3748                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])
3749
3750                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))
3751
3752                        # BED columns in the annotation file
3753                        if db_file_type in ["bed"]:
3754                            annotation_infos = "CHROM,POS,POS," + annotation_infos
3755
3756                        for chrom in chomosomes_list:
3757
3758                            # Create BED on initial VCF
3759                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
3760                            tmp_bed = NamedTemporaryFile(
3761                                prefix=self.get_prefix(),
3762                                dir=self.get_tmp_dir(),
3763                                suffix=".bed",
3764                                delete=False,
3765                            )
3766                            tmp_bed_name = tmp_bed.name
3767                            tmp_files.append(tmp_bed_name)
3768
3769                            # Detecte regions
3770                            log.debug(
3771                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
3772                            )
3773                            window = 1000000
3774                            sql_query_intervals_for_bed = f"""
3775                                SELECT  \"#CHROM\",
3776                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
3777                                        \"POS\"+{window}
3778                                FROM {table_variants} as table_variants
3779                                WHERE table_variants.\"#CHROM\" = '{chrom}'
3780                            """
3781                            regions = self.conn.execute(
3782                                sql_query_intervals_for_bed
3783                            ).fetchall()
3784                            merged_regions = merge_regions(regions)
3785                            log.debug(
3786                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
3787                            )
3788
3789                            header = ["#CHROM", "START", "END"]
3790                            with open(tmp_bed_name, "w") as f:
3791                                # Write the header with tab delimiter
3792                                f.write("\t".join(header) + "\n")
3793                                for d in merged_regions:
3794                                    # Write each data row with tab delimiter
3795                                    f.write("\t".join(map(str, d)) + "\n")
3796
3797                            # Tmp files
3798                            tmp_annotation_vcf = NamedTemporaryFile(
3799                                prefix=self.get_prefix(),
3800                                dir=self.get_tmp_dir(),
3801                                suffix=".vcf.gz",
3802                                delete=False,
3803                            )
3804                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
3805                            tmp_files.append(tmp_annotation_vcf_name)
3806                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
3807                            tmp_annotation_vcf_name_err = (
3808                                tmp_annotation_vcf_name + ".err"
3809                            )
3810                            err_files.append(tmp_annotation_vcf_name_err)
3811
3812                            # Annotate Command
3813                            log.debug(
3814                                f"Annotation '{annotation}' - add bcftools command"
3815                            )
3816
3817                            # Command
3818                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
3819
3820                            # Add command
3821                            commands.append(command_annotate)
3822
3823            # if some commands
3824            if commands:
3825
3826                # Export VCF file
3827                self.export_variant_vcf(
3828                    vcf_file=tmp_vcf_name,
3829                    remove_info=True,
3830                    add_samples=False,
3831                    index=True,
3832                )
3833
3834                # Threads
3835                # calculate threads for annotated commands
3836                if commands:
3837                    threads_bcftools_annotate = round(threads / len(commands))
3838                else:
3839                    threads_bcftools_annotate = 1
3840
3841                if not threads_bcftools_annotate:
3842                    threads_bcftools_annotate = 1
3843
3844                # Add threads option to bcftools commands
3845                if threads_bcftools_annotate > 1:
3846                    commands_threaded = []
3847                    for command in commands:
3848                        commands_threaded.append(
3849                            command.replace(
3850                                f"{bcftools_bin_command} annotate ",
3851                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
3852                            )
3853                        )
3854                    commands = commands_threaded
3855
3856                # Command annotation multithreading
3857                log.debug(f"Annotation - Annotation commands: " + str(commands))
3858                log.info(
3859                    f"Annotation - Annotation multithreaded in "
3860                    + str(len(commands))
3861                    + " commands"
3862                )
3863
3864                run_parallel_commands(commands, threads)
3865
3866                # Merge
3867                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)
3868
3869                if tmp_ann_vcf_list_cmd:
3870
3871                    # Tmp file
3872                    tmp_annotate_vcf = NamedTemporaryFile(
3873                        prefix=self.get_prefix(),
3874                        dir=self.get_tmp_dir(),
3875                        suffix=".vcf.gz",
3876                        delete=True,
3877                    )
3878                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
3879                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
3880                    err_files.append(tmp_annotate_vcf_name_err)
3881
3882                    # Tmp file remove command
3883                    tmp_files_remove_command = ""
3884                    if tmp_files:
3885                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)
3886
3887                    # Command merge
3888                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
3889                    log.info(
3890                        f"Annotation - Annotation merging "
3891                        + str(len(commands))
3892                        + " annotated files"
3893                    )
3894                    log.debug(f"Annotation - merge command: {merge_command}")
3895                    run_parallel_commands([merge_command], 1)
3896
3897                    # Error messages
3898                    log.info(f"Error/Warning messages:")
3899                    error_message_command_all = []
3900                    error_message_command_warning = []
3901                    error_message_command_err = []
3902                    for err_file in err_files:
3903                        with open(err_file, "r") as f:
3904                            for line in f:
3905                                message = line.strip()
3906                                error_message_command_all.append(message)
3907                                if line.startswith("[W::"):
3908                                    error_message_command_warning.append(message)
3909                                if line.startswith("[E::"):
3910                                    error_message_command_err.append(
3911                                        f"{err_file}: " + message
3912                                    )
3913                    # log info
3914                    for message in list(
3915                        set(error_message_command_err + error_message_command_warning)
3916                    ):
3917                        log.info(f"   {message}")
3918                    # debug info
3919                    for message in list(set(error_message_command_all)):
3920                        log.debug(f"   {message}")
3921                    # failed
3922                    if len(error_message_command_err):
3923                        log.error("Annotation failed: Error in commands")
3924                        raise ValueError("Annotation failed: Error in commands")
3925
3926                    # Update variants
3927                    log.info(f"Annotation - Updating...")
3928                    self.update_from_vcf(tmp_annotate_vcf_name)
3929
3930    def annotation_exomiser(self, threads: int = None) -> None:
3931        """
3932        This function annotate with Exomiser
3933
3934        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
3935        - "analysis" (dict/file):
3936            Full analysis dictionnary parameters (see Exomiser docs).
3937            Either a dict, or a file in JSON or YAML format.
3938            These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO)
3939            Default : None
3940        - "preset" (string):
3941            Analysis preset (available in config folder).
3942            Used if no full "analysis" is provided.
3943            Default: "exome"
3944        - "phenopacket" (dict/file):
3945            Samples and phenotipic features parameters (see Exomiser docs).
3946            Either a dict, or a file in JSON or YAML format.
3947            Default: None
3948        - "subject" (dict):
3949            Sample parameters (see Exomiser docs).
3950            Example:
3951                "subject":
3952                    {
3953                        "id": "ISDBM322017",
3954                        "sex": "FEMALE"
3955                    }
3956            Default: None
3957        - "sample" (string):
3958            Sample name to construct "subject" section:
3959                "subject":
3960                    {
3961                        "id": "<sample>",
3962                        "sex": "UNKNOWN_SEX"
3963                    }
3964            Default: None
3965        - "phenotypicFeatures" (dict)
3966            Phenotypic features to construct "subject" section.
3967            Example:
3968                "phenotypicFeatures":
3969                    [
3970                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
3971                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
3972                    ]
3973        - "hpo" (list)
3974            List of HPO ids as phenotypic features.
3975            Example:
3976                "hpo": ['0001156', '0001363', '0011304', '0010055']
3977            Default: []
3978        - "outputOptions" (dict):
3979            Output options (see Exomiser docs).
3980            Default:
3981                "output_options" =
3982                    {
3983                        "outputContributingVariantsOnly": False,
3984                        "numGenes": 0,
3985                        "outputFormats": ["TSV_VARIANT", "VCF"]
3986                    }
3987        - "transcript_source" (string):
3988            Transcript source (either "refseq", "ucsc", "ensembl")
3989            Default: "refseq"
3990        - "exomiser_to_info" (boolean):
3991            Add exomiser TSV file columns as INFO fields in VCF.
3992            Default: False
3993        - "release" (string):
3994            Exomise database release.
3995            If not exists, database release will be downloaded (take a while).
3996            Default: None (provided by application.properties configuration file)
3997        - "exomiser_application_properties" (file):
3998            Exomiser configuration file (see Exomiser docs).
3999            Useful to automatically download databases (especially for specific genome databases).
4000
4001        Notes:
4002        - If no sample in parameters, first sample in VCF will be chosen
4003        - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off
4004
4005        :param threads: The number of threads to use
4006        :return: None.
4007        """
4008
4009        # DEBUG
4010        log.debug("Start annotation with Exomiser databases")
4011
4012        # Threads
4013        if not threads:
4014            threads = self.get_threads()
4015        log.debug("Threads: " + str(threads))
4016
4017        # Config
4018        config = self.get_config()
4019        log.debug("Config: " + str(config))
4020
4021        # Config - Folders - Databases
4022        databases_folders = (
4023            config.get("folders", {})
4024            .get("databases", {})
4025            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
4026        )
4027        databases_folders = full_path(databases_folders)
4028        if not os.path.exists(databases_folders):
4029            log.error(f"Databases annotations: {databases_folders} NOT found")
4030        log.debug("Databases annotations: " + str(databases_folders))
4031
4032        # Config - Exomiser
4033        exomiser_bin_command = get_bin_command(
4034            bin="exomiser-cli*.jar",
4035            tool="exomiser",
4036            bin_type="jar",
4037            config=config,
4038            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
4039        )
4040        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
4041        if not exomiser_bin_command:
4042            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
4043            log.error(msg_err)
4044            raise ValueError(msg_err)
4045
4046        # Param
4047        param = self.get_param()
4048        log.debug("Param: " + str(param))
4049
4050        # Param - Exomiser
4051        param_exomiser = param.get("annotation", {}).get("exomiser", {})
4052        log.debug(f"Param Exomiser: {param_exomiser}")
4053
4054        # Param - Assembly
4055        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
4056        log.debug("Assembly: " + str(assembly))
4057
4058        # Data
4059        table_variants = self.get_table_variants()
4060
4061        # Check if not empty
4062        log.debug("Check if not empty")
4063        sql_query_chromosomes = (
4064            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
4065        )
4066        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
4067            log.info(f"VCF empty")
4068            return False
4069
4070        # VCF header
4071        vcf_reader = self.get_header()
4072        log.debug("Initial header: " + str(vcf_reader.infos))
4073
4074        # Samples
4075        samples = self.get_header_sample_list()
4076        if not samples:
4077            log.error("No Samples in VCF")
4078            return False
4079        log.debug(f"Samples: {samples}")
4080
4081        # Memory limit
4082        memory_limit = self.get_memory("8G")
4083        log.debug(f"memory_limit: {memory_limit}")
4084
4085        # Exomiser java options
4086        exomiser_java_options = (
4087            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
4088        )
4089        log.debug(f"Exomiser java options: {exomiser_java_options}")
4090
4091        # Download Exomiser (if not exists)
4092        exomiser_release = param_exomiser.get("release", None)
4093        exomiser_application_properties = param_exomiser.get(
4094            "exomiser_application_properties", None
4095        )
4096        databases_download_exomiser(
4097            assemblies=[assembly],
4098            exomiser_folder=databases_folders,
4099            exomiser_release=exomiser_release,
4100            exomiser_phenotype_release=exomiser_release,
4101            exomiser_application_properties=exomiser_application_properties,
4102        )
4103
4104        # Force annotation
4105        force_update_annotation = True
4106
4107        if "Exomiser" not in self.get_header().infos or force_update_annotation:
4108            log.debug("Start annotation Exomiser")
4109
4110            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
4111
4112                # tmp_dir = "/tmp/exomiser"
4113
4114                ### ANALYSIS ###
4115                ################
4116
4117                # Create analysis.json through analysis dict
4118                # either analysis in param or by default
4119                # depending on preset exome/genome)
4120
4121                # Init analysis dict
4122                param_exomiser_analysis_dict = {}
4123
4124                # analysis from param
4125                param_exomiser_analysis = param_exomiser.get("analysis", {})
4126                param_exomiser_analysis = full_path(param_exomiser_analysis)
4127
4128                # If analysis in param -> load anlaysis json
4129                if param_exomiser_analysis:
4130
4131                    # If param analysis is a file and exists
4132                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
4133                        param_exomiser_analysis
4134                    ):
4135                        # Load analysis file into analysis dict (either yaml or json)
4136                        with open(param_exomiser_analysis) as json_file:
4137                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
4138
4139                    # If param analysis is a dict
4140                    elif isinstance(param_exomiser_analysis, dict):
4141                        # Load analysis dict into analysis dict (either yaml or json)
4142                        param_exomiser_analysis_dict = param_exomiser_analysis
4143
4144                    # Error analysis type
4145                    else:
4146                        log.error(f"Analysis type unknown. Check param file.")
4147                        raise ValueError(f"Analysis type unknown. Check param file.")
4148
4149                # Case no input analysis config file/dict
4150                # Use preset (exome/genome) to open default config file
4151                if not param_exomiser_analysis_dict:
4152
4153                    # default preset
4154                    default_preset = "exome"
4155
4156                    # Get param preset or default preset
4157                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
4158
4159                    # Try to find if preset is a file
4160                    if os.path.exists(param_exomiser_preset):
4161                        # Preset file is provided in full path
4162                        param_exomiser_analysis_default_config_file = (
4163                            param_exomiser_preset
4164                        )
4165                    # elif os.path.exists(full_path(param_exomiser_preset)):
4166                    #     # Preset file is provided in full path
4167                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
4168                    elif os.path.exists(
4169                        os.path.join(folder_config, param_exomiser_preset)
4170                    ):
4171                        # Preset file is provided a basename in config folder (can be a path with subfolders)
4172                        param_exomiser_analysis_default_config_file = os.path.join(
4173                            folder_config, param_exomiser_preset
4174                        )
4175                    else:
4176                        # Construct preset file
4177                        param_exomiser_analysis_default_config_file = os.path.join(
4178                            folder_config,
4179                            f"preset-{param_exomiser_preset}-analysis.json",
4180                        )
4181
4182                    # If preset file exists
4183                    param_exomiser_analysis_default_config_file = full_path(
4184                        param_exomiser_analysis_default_config_file
4185                    )
4186                    if os.path.exists(param_exomiser_analysis_default_config_file):
4187                        # Load prest file into analysis dict (either yaml or json)
4188                        with open(
4189                            param_exomiser_analysis_default_config_file
4190                        ) as json_file:
4191                            # param_exomiser_analysis_dict[""] = json.load(json_file)
4192                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
4193                                json_file
4194                            )
4195
4196                    # Error preset file
4197                    else:
4198                        log.error(
4199                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4200                        )
4201                        raise ValueError(
4202                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4203                        )
4204
4205                # If no analysis dict created
4206                if not param_exomiser_analysis_dict:
4207                    log.error(f"No analysis config")
4208                    raise ValueError(f"No analysis config")
4209
4210                # Log
4211                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4212
4213                ### PHENOPACKET ###
4214                ###################
4215
4216                # If no PhenoPacket in analysis dict -> check in param
4217                if "phenopacket" not in param_exomiser_analysis_dict:
4218
4219                    # If PhenoPacket in param -> load anlaysis json
4220                    if param_exomiser.get("phenopacket", None):
4221
4222                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
4223                        param_exomiser_phenopacket = full_path(
4224                            param_exomiser_phenopacket
4225                        )
4226
4227                        # If param phenopacket is a file and exists
4228                        if isinstance(
4229                            param_exomiser_phenopacket, str
4230                        ) and os.path.exists(param_exomiser_phenopacket):
4231                            # Load phenopacket file into analysis dict (either yaml or json)
4232                            with open(param_exomiser_phenopacket) as json_file:
4233                                param_exomiser_analysis_dict["phenopacket"] = (
4234                                    yaml.safe_load(json_file)
4235                                )
4236
4237                        # If param phenopacket is a dict
4238                        elif isinstance(param_exomiser_phenopacket, dict):
4239                            # Load phenopacket dict into analysis dict (either yaml or json)
4240                            param_exomiser_analysis_dict["phenopacket"] = (
4241                                param_exomiser_phenopacket
4242                            )
4243
4244                        # Error phenopacket type
4245                        else:
4246                            log.error(f"Phenopacket type unknown. Check param file.")
4247                            raise ValueError(
4248                                f"Phenopacket type unknown. Check param file."
4249                            )
4250
4251                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
4252                if "phenopacket" not in param_exomiser_analysis_dict:
4253
4254                    # Init PhenoPacket
4255                    param_exomiser_analysis_dict["phenopacket"] = {
4256                        "id": "analysis",
4257                        "proband": {},
4258                    }
4259
4260                    ### Add subject ###
4261
4262                    # If subject exists
4263                    param_exomiser_subject = param_exomiser.get("subject", {})
4264
4265                    # If subject not exists -> found sample ID
4266                    if not param_exomiser_subject:
4267
4268                        # Found sample ID in param
4269                        sample = param_exomiser.get("sample", None)
4270
4271                        # Find sample ID (first sample)
4272                        if not sample:
4273                            sample_list = self.get_header_sample_list()
4274                            if len(sample_list) > 0:
4275                                sample = sample_list[0]
4276                            else:
4277                                log.error(f"No sample found")
4278                                raise ValueError(f"No sample found")
4279
4280                        # Create subject
4281                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
4282
4283                    # Add to dict
4284                    param_exomiser_analysis_dict["phenopacket"][
4285                        "subject"
4286                    ] = param_exomiser_subject
4287
4288                    ### Add "phenotypicFeatures" ###
4289
4290                    # If phenotypicFeatures exists
4291                    param_exomiser_phenotypicfeatures = param_exomiser.get(
4292                        "phenotypicFeatures", []
4293                    )
4294
4295                    # If phenotypicFeatures not exists -> Try to infer from hpo list
4296                    if not param_exomiser_phenotypicfeatures:
4297
4298                        # Found HPO in param
4299                        param_exomiser_hpo = param_exomiser.get("hpo", [])
4300
4301                        # Split HPO if list in string format separated by comma
4302                        if isinstance(param_exomiser_hpo, str):
4303                            param_exomiser_hpo = param_exomiser_hpo.split(",")
4304
4305                        # Create HPO list
4306                        for hpo in param_exomiser_hpo:
4307                            hpo_clean = re.sub("[^0-9]", "", hpo)
4308                            param_exomiser_phenotypicfeatures.append(
4309                                {
4310                                    "type": {
4311                                        "id": f"HP:{hpo_clean}",
4312                                        "label": f"HP:{hpo_clean}",
4313                                    }
4314                                }
4315                            )
4316
4317                    # Add to dict
4318                    param_exomiser_analysis_dict["phenopacket"][
4319                        "phenotypicFeatures"
4320                    ] = param_exomiser_phenotypicfeatures
4321
4322                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
4323                    if not param_exomiser_phenotypicfeatures:
4324                        for step in param_exomiser_analysis_dict.get(
4325                            "analysis", {}
4326                        ).get("steps", []):
4327                            if "hiPhivePrioritiser" in step:
4328                                param_exomiser_analysis_dict.get("analysis", {}).get(
4329                                    "steps", []
4330                                ).remove(step)
4331
4332                ### Add Input File ###
4333
4334                # Initial file name and htsFiles
4335                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
4336                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
4337                    {
4338                        "uri": tmp_vcf_name,
4339                        "htsFormat": "VCF",
4340                        "genomeAssembly": assembly,
4341                    }
4342                ]
4343
4344                ### Add metaData ###
4345
4346                # If metaData not in analysis dict
4347                if "metaData" not in param_exomiser_analysis_dict:
4348                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
4349                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
4350                        "createdBy": "howard",
4351                        "phenopacketSchemaVersion": 1,
4352                    }
4353
4354                ### OutputOptions ###
4355
4356                # Init output result folder
4357                output_results = os.path.join(tmp_dir, "results")
4358
4359                # If no outputOptions in analysis dict
4360                if "outputOptions" not in param_exomiser_analysis_dict:
4361
4362                    # default output formats
4363                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
4364
4365                    # Get outputOptions in param
4366                    output_options = param_exomiser.get("outputOptions", None)
4367
4368                    # If no output_options in param -> check
4369                    if not output_options:
4370                        output_options = {
4371                            "outputContributingVariantsOnly": False,
4372                            "numGenes": 0,
4373                            "outputFormats": defaut_output_formats,
4374                        }
4375
4376                    # Replace outputDirectory in output options
4377                    output_options["outputDirectory"] = output_results
4378                    output_options["outputFileName"] = "howard"
4379
4380                    # Add outputOptions in analysis dict
4381                    param_exomiser_analysis_dict["outputOptions"] = output_options
4382
4383                else:
4384
4385                    # Replace output_results and output format (if exists in param)
4386                    param_exomiser_analysis_dict["outputOptions"][
4387                        "outputDirectory"
4388                    ] = output_results
4389                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
4390                        list(
4391                            set(
4392                                param_exomiser_analysis_dict.get(
4393                                    "outputOptions", {}
4394                                ).get("outputFormats", [])
4395                                + ["TSV_VARIANT", "VCF"]
4396                            )
4397                        )
4398                    )
4399
4400                # log
4401                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4402
4403                ### ANALYSIS FILE ###
4404                #####################
4405
4406                ### Full JSON analysis config file ###
4407
4408                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
4409                with open(exomiser_analysis, "w") as fp:
4410                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
4411
4412                ### SPLIT analysis and sample config files
4413
4414                # Splitted analysis dict
4415                param_exomiser_analysis_dict_for_split = (
4416                    param_exomiser_analysis_dict.copy()
4417                )
4418
4419                # Phenopacket JSON file
4420                exomiser_analysis_phenopacket = os.path.join(
4421                    tmp_dir, "analysis_phenopacket.json"
4422                )
4423                with open(exomiser_analysis_phenopacket, "w") as fp:
4424                    json.dump(
4425                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
4426                        fp,
4427                        indent=4,
4428                    )
4429
4430                # Analysis JSON file without Phenopacket parameters
4431                param_exomiser_analysis_dict_for_split.pop("phenopacket")
4432                exomiser_analysis_analysis = os.path.join(
4433                    tmp_dir, "analysis_analysis.json"
4434                )
4435                with open(exomiser_analysis_analysis, "w") as fp:
4436                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
4437
4438                ### INITAL VCF file ###
4439                #######################
4440
4441                ### Create list of samples to use and include inti initial VCF file ####
4442
4443                # Subject (main sample)
4444                # Get sample ID in analysis dict
4445                sample_subject = (
4446                    param_exomiser_analysis_dict.get("phenopacket", {})
4447                    .get("subject", {})
4448                    .get("id", None)
4449                )
4450                sample_proband = (
4451                    param_exomiser_analysis_dict.get("phenopacket", {})
4452                    .get("proband", {})
4453                    .get("subject", {})
4454                    .get("id", None)
4455                )
4456                sample = []
4457                if sample_subject:
4458                    sample.append(sample_subject)
4459                if sample_proband:
4460                    sample.append(sample_proband)
4461
4462                # Get sample ID within Pedigree
4463                pedigree_persons_list = (
4464                    param_exomiser_analysis_dict.get("phenopacket", {})
4465                    .get("pedigree", {})
4466                    .get("persons", {})
4467                )
4468
4469                # Create list with all sample ID in pedigree (if exists)
4470                pedigree_persons = []
4471                for person in pedigree_persons_list:
4472                    pedigree_persons.append(person.get("individualId"))
4473
4474                # Concat subject sample ID and samples ID in pedigreesamples
4475                samples = list(set(sample + pedigree_persons))
4476
4477                # Check if sample list is not empty
4478                if not samples:
4479                    log.error(f"No samples found")
4480                    raise ValueError(f"No samples found")
4481
4482                # Create VCF with sample (either sample in param or first one by default)
4483                # Export VCF file
4484                self.export_variant_vcf(
4485                    vcf_file=tmp_vcf_name,
4486                    remove_info=True,
4487                    add_samples=True,
4488                    list_samples=samples,
4489                    index=False,
4490                )
4491
4492                ### Execute Exomiser ###
4493                ########################
4494
4495                # Init command
4496                exomiser_command = ""
4497
4498                # Command exomiser options
4499                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
4500
4501                # Release
4502                exomiser_release = param_exomiser.get("release", None)
4503                if exomiser_release:
4504                    # phenotype data version
4505                    exomiser_options += (
4506                        f" --exomiser.phenotype.data-version={exomiser_release} "
4507                    )
4508                    # data version
4509                    exomiser_options += (
4510                        f" --exomiser.{assembly}.data-version={exomiser_release} "
4511                    )
4512                    # variant white list
4513                    variant_white_list_file = (
4514                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
4515                    )
4516                    if os.path.exists(
4517                        os.path.join(
4518                            databases_folders, assembly, variant_white_list_file
4519                        )
4520                    ):
4521                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
4522
4523                # transcript_source
4524                transcript_source = param_exomiser.get(
4525                    "transcript_source", None
4526                )  # ucsc, refseq, ensembl
4527                if transcript_source:
4528                    exomiser_options += (
4529                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
4530                    )
4531
4532                # If analysis contain proband param
4533                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
4534                    "proband", {}
4535                ):
4536                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
4537
4538                # If no proband (usually uniq sample)
4539                else:
4540                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
4541
4542                # Log
4543                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
4544
4545                # Run command
4546                result = subprocess.call(
4547                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
4548                )
4549                if result:
4550                    log.error("Exomiser command failed")
4551                    raise ValueError("Exomiser command failed")
4552
4553                ### RESULTS ###
4554                ###############
4555
4556                ### Annotate with TSV fields ###
4557
4558                # Init result tsv file
4559                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
4560
4561                # Init result tsv file
4562                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
4563
4564                # Parse TSV file and explode columns in INFO field
4565                if exomiser_to_info and os.path.exists(output_results_tsv):
4566
4567                    # Log
4568                    log.debug("Exomiser columns to VCF INFO field")
4569
4570                    # Retrieve columns and types
4571                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
4572                    output_results_tsv_df = self.get_query_to_df(query)
4573                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
4574
4575                    # Init concat fields for update
4576                    sql_query_update_concat_fields = []
4577
4578                    # Fields to avoid
4579                    fields_to_avoid = [
4580                        "CONTIG",
4581                        "START",
4582                        "END",
4583                        "REF",
4584                        "ALT",
4585                        "QUAL",
4586                        "FILTER",
4587                        "GENOTYPE",
4588                    ]
4589
4590                    # List all columns to add into header
4591                    for header_column in output_results_tsv_columns:
4592
4593                        # If header column is enable
4594                        if header_column not in fields_to_avoid:
4595
4596                            # Header info type
4597                            header_info_type = "String"
4598                            header_column_df = output_results_tsv_df[header_column]
4599                            header_column_df_dtype = header_column_df.dtype
4600                            if header_column_df_dtype == object:
4601                                if (
4602                                    pd.to_numeric(header_column_df, errors="coerce")
4603                                    .notnull()
4604                                    .all()
4605                                ):
4606                                    header_info_type = "Float"
4607                            else:
4608                                header_info_type = "Integer"
4609
4610                            # Header info
4611                            characters_to_validate = ["-"]
4612                            pattern = "[" + "".join(characters_to_validate) + "]"
4613                            header_info_name = re.sub(
4614                                pattern,
4615                                "_",
4616                                f"Exomiser_{header_column}".replace("#", ""),
4617                            )
4618                            header_info_number = "."
4619                            header_info_description = (
4620                                f"Exomiser {header_column} annotation"
4621                            )
4622                            header_info_source = "Exomiser"
4623                            header_info_version = "unknown"
4624                            header_info_code = CODE_TYPE_MAP[header_info_type]
4625                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
4626                                header_info_name,
4627                                header_info_number,
4628                                header_info_type,
4629                                header_info_description,
4630                                header_info_source,
4631                                header_info_version,
4632                                header_info_code,
4633                            )
4634
4635                            # Add field to add for update to concat fields
4636                            sql_query_update_concat_fields.append(
4637                                f"""
4638                                CASE
4639                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
4640                                    THEN concat(
4641                                        '{header_info_name}=',
4642                                        table_parquet."{header_column}",
4643                                        ';'
4644                                        )
4645
4646                                    ELSE ''
4647                                END
4648                            """
4649                            )
4650
4651                    # Update query
4652                    sql_query_update = f"""
4653                        UPDATE {table_variants} as table_variants
4654                            SET INFO = concat(
4655                                            CASE
4656                                                WHEN INFO NOT IN ('', '.')
4657                                                THEN INFO
4658                                                ELSE ''
4659                                            END,
4660                                            CASE
4661                                                WHEN table_variants.INFO NOT IN ('','.')
4662                                                THEN ';'
4663                                                ELSE ''
4664                                            END,
4665                                            (
4666                                            SELECT 
4667                                                concat(
4668                                                    {",".join(sql_query_update_concat_fields)}
4669                                                )
4670                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
4671                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
4672                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
4673                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
4674                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
4675                                            )
4676                                        )
4677                            ;
4678                        """
4679
4680                    # Update
4681                    self.conn.execute(sql_query_update)
4682
4683                ### Annotate with VCF INFO field ###
4684
4685                # Init result VCF file
4686                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
4687
4688                # If VCF exists
4689                if os.path.exists(output_results_vcf):
4690
4691                    # Log
4692                    log.debug("Exomiser result VCF update variants")
4693
4694                    # Find Exomiser INFO field annotation in header
4695                    with gzip.open(output_results_vcf, "rt") as f:
4696                        header_list = self.read_vcf_header(f)
4697                    exomiser_vcf_header = vcf.Reader(
4698                        io.StringIO("\n".join(header_list))
4699                    )
4700
4701                    # Add annotation INFO field to header
4702                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
4703
4704                    # Update variants with VCF
4705                    self.update_from_vcf(output_results_vcf)
4706
4707        return True
4708
4709    def annotation_snpeff(self, threads: int = None) -> None:
4710        """
4711        This function annotate with snpEff
4712
4713        :param threads: The number of threads to use
4714        :return: the value of the variable "return_value".
4715        """
4716
4717        # DEBUG
4718        log.debug("Start annotation with snpeff databases")
4719
4720        # Threads
4721        if not threads:
4722            threads = self.get_threads()
4723        log.debug("Threads: " + str(threads))
4724
4725        # DEBUG
4726        delete_tmp = True
4727        if self.get_config().get("verbosity", "warning") in ["debug"]:
4728            delete_tmp = False
4729            log.debug("Delete tmp files/folders: " + str(delete_tmp))
4730
4731        # Config
4732        config = self.get_config()
4733        log.debug("Config: " + str(config))
4734
4735        # Config - Folders - Databases
4736        databases_folders = (
4737            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
4738        )
4739        log.debug("Databases annotations: " + str(databases_folders))
4740
4741        # # Config - Java
4742        # java_bin = get_bin(
4743        #     tool="java",
4744        #     bin="java",
4745        #     bin_type="bin",
4746        #     config=config,
4747        #     default_folder="/usr/bin",
4748        # )
4749        # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))):
4750        #     log.error(f"Annotation failed: no java bin '{java_bin}'")
4751        #     raise ValueError(f"Annotation failed: no java bin '{java_bin}'")
4752
4753        # # Config - snpEff bin
4754        # snpeff_jar = get_bin(
4755        #     tool="snpeff",
4756        #     bin="snpEff.jar",
4757        #     bin_type="jar",
4758        #     config=config,
4759        #     default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
4760        # )
4761        # if not (os.path.exists(snpeff_jar) or (snpeff_jar and which(snpeff_jar))):
4762        #     log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
4763        #     raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
4764
4765        # Config - snpEff bin command
4766        snpeff_bin_command = get_bin_command(
4767            bin="snpEff.jar",
4768            tool="snpeff",
4769            bin_type="jar",
4770            config=config,
4771            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
4772        )
4773        if not snpeff_bin_command:
4774            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
4775            log.error(msg_err)
4776            raise ValueError(msg_err)
4777
4778        # Config - snpEff databases
4779        snpeff_databases = (
4780            config.get("folders", {})
4781            .get("databases", {})
4782            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
4783        )
4784        snpeff_databases = full_path(snpeff_databases)
4785        if snpeff_databases is not None and snpeff_databases != "":
4786            log.debug(f"Create snpEff databases folder")
4787            if not os.path.exists(snpeff_databases):
4788                os.makedirs(snpeff_databases)
4789
4790        # Param
4791        param = self.get_param()
4792        log.debug("Param: " + str(param))
4793
4794        # Param
4795        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
4796        log.debug("Options: " + str(options))
4797
4798        # Param - Assembly
4799        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
4800
4801        # Param - Options
4802        snpeff_options = (
4803            param.get("annotation", {}).get("snpeff", {}).get("options", "")
4804        )
4805        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
4806        snpeff_csvstats = (
4807            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
4808        )
4809        if snpeff_stats:
4810            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
4811            snpeff_stats = full_path(snpeff_stats)
4812            snpeff_options += f" -stats {snpeff_stats}"
4813        if snpeff_csvstats:
4814            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
4815            snpeff_csvstats = full_path(snpeff_csvstats)
4816            snpeff_options += f" -csvStats {snpeff_csvstats}"
4817
4818        # Data
4819        table_variants = self.get_table_variants()
4820
4821        # Check if not empty
4822        log.debug("Check if not empty")
4823        sql_query_chromosomes = (
4824            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
4825        )
4826        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
4827        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
4828            log.info(f"VCF empty")
4829            return
4830
4831        # Export in VCF
4832        log.debug("Create initial file to annotate")
4833        tmp_vcf = NamedTemporaryFile(
4834            prefix=self.get_prefix(),
4835            dir=self.get_tmp_dir(),
4836            suffix=".vcf.gz",
4837            delete=True,
4838        )
4839        tmp_vcf_name = tmp_vcf.name
4840
4841        # VCF header
4842        vcf_reader = self.get_header()
4843        log.debug("Initial header: " + str(vcf_reader.infos))
4844
4845        # Existing annotations
4846        for vcf_annotation in self.get_header().infos:
4847
4848            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
4849            log.debug(
4850                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
4851            )
4852
4853        # Memory limit
4854        # if config.get("memory", None):
4855        #     memory_limit = config.get("memory", "8G")
4856        # else:
4857        #     memory_limit = "8G"
4858        memory_limit = self.get_memory("8G")
4859        log.debug(f"memory_limit: {memory_limit}")
4860
4861        # snpEff java options
4862        snpeff_java_options = (
4863            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
4864        )
4865        log.debug(f"Exomiser java options: {snpeff_java_options}")
4866
4867        force_update_annotation = True
4868
4869        if "ANN" not in self.get_header().infos or force_update_annotation:
4870
4871            # Check snpEff database
4872            log.debug(f"Check snpEff databases {[assembly]}")
4873            databases_download_snpeff(
4874                folder=snpeff_databases, assemblies=[assembly], config=config
4875            )
4876
4877            # Export VCF file
4878            self.export_variant_vcf(
4879                vcf_file=tmp_vcf_name,
4880                remove_info=True,
4881                add_samples=False,
4882                index=True,
4883            )
4884
4885            # Tmp file
4886            err_files = []
4887            tmp_annotate_vcf = NamedTemporaryFile(
4888                prefix=self.get_prefix(),
4889                dir=self.get_tmp_dir(),
4890                suffix=".vcf",
4891                delete=False,
4892            )
4893            tmp_annotate_vcf_name = tmp_annotate_vcf.name
4894            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
4895            err_files.append(tmp_annotate_vcf_name_err)
4896
4897            # Command
4898            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
4899            log.debug(f"Annotation - snpEff command: {snpeff_command}")
4900            run_parallel_commands([snpeff_command], 1)
4901
4902            # Error messages
4903            log.info(f"Error/Warning messages:")
4904            error_message_command_all = []
4905            error_message_command_warning = []
4906            error_message_command_err = []
4907            for err_file in err_files:
4908                with open(err_file, "r") as f:
4909                    for line in f:
4910                        message = line.strip()
4911                        error_message_command_all.append(message)
4912                        if line.startswith("[W::"):
4913                            error_message_command_warning.append(message)
4914                        if line.startswith("[E::"):
4915                            error_message_command_err.append(f"{err_file}: " + message)
4916            # log info
4917            for message in list(
4918                set(error_message_command_err + error_message_command_warning)
4919            ):
4920                log.info(f"   {message}")
4921            # debug info
4922            for message in list(set(error_message_command_all)):
4923                log.debug(f"   {message}")
4924            # failed
4925            if len(error_message_command_err):
4926                log.error("Annotation failed: Error in commands")
4927                raise ValueError("Annotation failed: Error in commands")
4928
4929            # Find annotation in header
4930            with open(tmp_annotate_vcf_name, "rt") as f:
4931                header_list = self.read_vcf_header(f)
4932            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
4933
4934            for ann in annovar_vcf_header.infos:
4935                if ann not in self.get_header().infos:
4936                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
4937
4938            # Update variants
4939            log.info(f"Annotation - Updating...")
4940            self.update_from_vcf(tmp_annotate_vcf_name)
4941
4942        else:
4943            if "ANN" in self.get_header().infos:
4944                log.debug(f"Existing snpEff annotations in VCF")
4945            if force_update_annotation:
4946                log.debug(f"Existing snpEff annotations in VCF - annotation forced")
4947
4948    def annotation_annovar(self, threads: int = None) -> None:
4949        """
4950        It takes a VCF file, annotates it with Annovar, and then updates the database with the new
4951        annotations
4952
4953        :param threads: number of threads to use
4954        :return: the value of the variable "return_value".
4955        """
4956
4957        # DEBUG
4958        log.debug("Start annotation with Annovar databases")
4959
4960        # Threads
4961        if not threads:
4962            threads = self.get_threads()
4963        log.debug("Threads: " + str(threads))
4964
4965        # Tmp en Err files
4966        tmp_files = []
4967        err_files = []
4968
4969        # DEBUG
4970        delete_tmp = True
4971        if self.get_config().get("verbosity", "warning") in ["debug"]:
4972            delete_tmp = False
4973            log.debug("Delete tmp files/folders: " + str(delete_tmp))
4974
4975        # Config
4976        config = self.get_config()
4977        log.debug("Config: " + str(config))
4978
4979        # Config - Folders - Databases
4980        databases_folders = (
4981            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
4982        )
4983        log.debug("Databases annotations: " + str(databases_folders))
4984
4985        # Config - annovar bin command
4986        annovar_bin_command = get_bin_command(
4987            bin="table_annovar.pl",
4988            tool="annovar",
4989            bin_type="perl",
4990            config=config,
4991            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
4992        )
4993        if not annovar_bin_command:
4994            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
4995            log.error(msg_err)
4996            raise ValueError(msg_err)
4997
4998        # Config - BCFTools bin command
4999        bcftools_bin_command = get_bin_command(
5000            bin="bcftools",
5001            tool="bcftools",
5002            bin_type="bin",
5003            config=config,
5004            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
5005        )
5006        if not bcftools_bin_command:
5007            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
5008            log.error(msg_err)
5009            raise ValueError(msg_err)
5010
5011        # Config - annovar databases
5012        annovar_databases = (
5013            config.get("folders", {})
5014            .get("databases", {})
5015            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
5016        )
5017        annovar_databases = full_path(annovar_databases)
5018        if annovar_databases != "" and not os.path.exists(annovar_databases):
5019            os.makedirs(annovar_databases)
5020
5021        # Param
5022        param = self.get_param()
5023        log.debug("Param: " + str(param))
5024
5025        # Param - options
5026        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
5027        log.debug("Options: " + str(options))
5028
5029        # Param - annotations
5030        annotations = (
5031            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
5032        )
5033        log.debug("Annotations: " + str(annotations))
5034
5035        # Param - Assembly
5036        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
5037
5038        # Annovar database assembly
5039        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
5040        if annovar_databases_assembly != "" and not os.path.exists(
5041            annovar_databases_assembly
5042        ):
5043            os.makedirs(annovar_databases_assembly)
5044
5045        # Data
5046        table_variants = self.get_table_variants()
5047
5048        # Check if not empty
5049        log.debug("Check if not empty")
5050        sql_query_chromosomes = (
5051            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
5052        )
5053        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
5054        if not sql_query_chromosomes_df["count"][0]:
5055            log.info(f"VCF empty")
5056            return
5057
5058        # VCF header
5059        vcf_reader = self.get_header()
5060        log.debug("Initial header: " + str(vcf_reader.infos))
5061
5062        # Existing annotations
5063        for vcf_annotation in self.get_header().infos:
5064
5065            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
5066            log.debug(
5067                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
5068            )
5069
5070        force_update_annotation = True
5071
5072        if annotations:
5073
5074            commands = []
5075            tmp_annotates_vcf_name_list = []
5076
5077            # Export in VCF
5078            log.debug("Create initial file to annotate")
5079            tmp_vcf = NamedTemporaryFile(
5080                prefix=self.get_prefix(),
5081                dir=self.get_tmp_dir(),
5082                suffix=".vcf.gz",
5083                delete=False,
5084            )
5085            tmp_vcf_name = tmp_vcf.name
5086            tmp_files.append(tmp_vcf_name)
5087            tmp_files.append(tmp_vcf_name + ".tbi")
5088
5089            # Export VCF file
5090            self.export_variant_vcf(
5091                vcf_file=tmp_vcf_name,
5092                remove_info=".",
5093                add_samples=False,
5094                index=True,
5095            )
5096
5097            # Create file for field rename
5098            log.debug("Create file for field rename")
5099            tmp_rename = NamedTemporaryFile(
5100                prefix=self.get_prefix(),
5101                dir=self.get_tmp_dir(),
5102                suffix=".rename",
5103                delete=False,
5104            )
5105            tmp_rename_name = tmp_rename.name
5106            tmp_files.append(tmp_rename_name)
5107
5108            # Check Annovar database
5109            log.debug(
5110                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
5111            )
5112            databases_download_annovar(
5113                folder=annovar_databases,
5114                files=list(annotations.keys()),
5115                assemblies=[assembly],
5116            )
5117
5118            for annotation in annotations:
5119                annotation_fields = annotations[annotation]
5120
5121                if not annotation_fields:
5122                    annotation_fields = {"INFO": None}
5123
5124                log.info(f"Annotations Annovar - database '{annotation}'")
5125                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")
5126
5127                # Tmp file for annovar
5128                err_files = []
5129                tmp_annotate_vcf_directory = TemporaryDirectory(
5130                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
5131                )
5132                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
5133                tmp_annotate_vcf_name_annovar = (
5134                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
5135                )
5136                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
5137                err_files.append(tmp_annotate_vcf_name_err)
5138                tmp_files.append(tmp_annotate_vcf_name_err)
5139
5140                # Tmp file final vcf annotated by annovar
5141                tmp_annotate_vcf = NamedTemporaryFile(
5142                    prefix=self.get_prefix(),
5143                    dir=self.get_tmp_dir(),
5144                    suffix=".vcf.gz",
5145                    delete=False,
5146                )
5147                tmp_annotate_vcf_name = tmp_annotate_vcf.name
5148                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
5149                tmp_files.append(tmp_annotate_vcf_name)
5150                tmp_files.append(tmp_annotate_vcf_name + ".tbi")
5151
5152                # Number of fields
5153                annotation_list = []
5154                annotation_renamed_list = []
5155
5156                for annotation_field in annotation_fields:
5157
5158                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
5159                    annotation_fields_new_name = annotation_fields.get(
5160                        annotation_field, annotation_field
5161                    )
5162                    if not annotation_fields_new_name:
5163                        annotation_fields_new_name = annotation_field
5164
5165                    if (
5166                        force_update_annotation
5167                        or annotation_fields_new_name not in self.get_header().infos
5168                    ):
5169                        annotation_list.append(annotation_field)
5170                        annotation_renamed_list.append(annotation_fields_new_name)
5171                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
5172                        log.warning(
5173                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
5174                        )
5175
5176                    # Add rename info
5177                    run_parallel_commands(
5178                        [
5179                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
5180                        ],
5181                        1,
5182                    )
5183
5184                # log.debug("fields_to_removed: " + str(fields_to_removed))
5185                log.debug("annotation_list: " + str(annotation_list))
5186
5187                # protocol
5188                protocol = annotation
5189
5190                # argument
5191                argument = ""
5192
5193                # operation
5194                operation = "f"
5195                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
5196                    "ensGene"
5197                ):
5198                    operation = "g"
5199                    if options.get("genebase", None):
5200                        argument = f"""'{options.get("genebase","")}'"""
5201                elif annotation in ["cytoBand"]:
5202                    operation = "r"
5203
5204                # argument option
5205                argument_option = ""
5206                if argument != "":
5207                    argument_option = " --argument " + argument
5208
5209                # command options
5210                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
5211                for option in options:
5212                    if option not in ["genebase"]:
5213                        command_options += f""" --{option}={options[option]}"""
5214
5215                # Command
5216
5217                # Command - Annovar
5218                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
5219                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")
5220
5221                # Command - start pipe
5222                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """
5223
5224                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
5225                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """
5226
5227                # Command - Special characters (refGene annotation)
5228                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """
5229
5230                # Command - Clean empty fields (with value ".")
5231                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """
5232
5233                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
5234                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
5235                if "ALL" not in annotation_list and "INFO" not in annotation_list:
5236                    # for ann in annotation_renamed_list:
5237                    for ann in annotation_list:
5238                        annovar_fields_to_keep.append(f"^INFO/{ann}")
5239
5240                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """
5241
5242                # Command - indexing
5243                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """
5244
5245                log.debug(f"Annotation - Annovar command: {command_annovar}")
5246                run_parallel_commands([command_annovar], 1)
5247
5248                # Error messages
5249                log.info(f"Error/Warning messages:")
5250                error_message_command_all = []
5251                error_message_command_warning = []
5252                error_message_command_err = []
5253                for err_file in err_files:
5254                    with open(err_file, "r") as f:
5255                        for line in f:
5256                            message = line.strip()
5257                            error_message_command_all.append(message)
5258                            if line.startswith("[W::") or line.startswith("WARNING"):
5259                                error_message_command_warning.append(message)
5260                            if line.startswith("[E::") or line.startswith("ERROR"):
5261                                error_message_command_err.append(
5262                                    f"{err_file}: " + message
5263                                )
5264                # log info
5265                for message in list(
5266                    set(error_message_command_err + error_message_command_warning)
5267                ):
5268                    log.info(f"   {message}")
5269                # debug info
5270                for message in list(set(error_message_command_all)):
5271                    log.debug(f"   {message}")
5272                # failed
5273                if len(error_message_command_err):
5274                    log.error("Annotation failed: Error in commands")
5275                    raise ValueError("Annotation failed: Error in commands")
5276
5277            if tmp_annotates_vcf_name_list:
5278
5279                # List of annotated files
5280                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)
5281
5282                # Tmp file
5283                tmp_annotate_vcf = NamedTemporaryFile(
5284                    prefix=self.get_prefix(),
5285                    dir=self.get_tmp_dir(),
5286                    suffix=".vcf.gz",
5287                    delete=False,
5288                )
5289                tmp_annotate_vcf_name = tmp_annotate_vcf.name
5290                tmp_files.append(tmp_annotate_vcf_name)
5291                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
5292                err_files.append(tmp_annotate_vcf_name_err)
5293                tmp_files.append(tmp_annotate_vcf_name_err)
5294
5295                # Command merge
5296                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
5297                log.info(
5298                    f"Annotation Annovar - Annotation merging "
5299                    + str(len(tmp_annotates_vcf_name_list))
5300                    + " annotated files"
5301                )
5302                log.debug(f"Annotation - merge command: {merge_command}")
5303                run_parallel_commands([merge_command], 1)
5304
5305                # Find annotation in header
5306                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
5307                    header_list = self.read_vcf_header(f)
5308                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
5309
5310                for ann in annovar_vcf_header.infos:
5311                    if ann not in self.get_header().infos:
5312                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
5313
5314                # Update variants
5315                log.info(f"Annotation Annovar - Updating...")
5316                self.update_from_vcf(tmp_annotate_vcf_name)
5317
5318            # Clean files
5319            # Tmp file remove command
5320            if True:
5321                tmp_files_remove_command = ""
5322                if tmp_files:
5323                    tmp_files_remove_command = " ".join(tmp_files)
5324                clean_command = f" rm -f {tmp_files_remove_command} "
5325                log.debug(f"Annotation Annovar - Annotation cleaning ")
5326                log.debug(f"Annotation - cleaning command: {clean_command}")
5327                run_parallel_commands([clean_command], 1)
5328
5329    # Parquet
5330    def annotation_parquet(self, threads: int = None) -> None:
5331        """
5332        It takes a VCF file, and annotates it with a parquet file
5333
5334        :param threads: number of threads to use for the annotation
5335        :return: the value of the variable "result".
5336        """
5337
5338        # DEBUG
5339        log.debug("Start annotation with parquet databases")
5340
5341        # Threads
5342        if not threads:
5343            threads = self.get_threads()
5344        log.debug("Threads: " + str(threads))
5345
5346        # DEBUG
5347        delete_tmp = True
5348        if self.get_config().get("verbosity", "warning") in ["debug"]:
5349            delete_tmp = False
5350            log.debug("Delete tmp files/folders: " + str(delete_tmp))
5351
5352        # Config
5353        databases_folders = set(
5354            self.get_config()
5355            .get("folders", {})
5356            .get("databases", {})
5357            .get("annotations", ["."])
5358            + self.get_config()
5359            .get("folders", {})
5360            .get("databases", {})
5361            .get("parquet", ["."])
5362        )
5363        log.debug("Databases annotations: " + str(databases_folders))
5364
5365        # Param
5366        annotations = (
5367            self.get_param()
5368            .get("annotation", {})
5369            .get("parquet", {})
5370            .get("annotations", None)
5371        )
5372        log.debug("Annotations: " + str(annotations))
5373
5374        # Assembly
5375        assembly = self.get_param().get(
5376            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
5377        )
5378
5379        # Force Update Annotation
5380        force_update_annotation = (
5381            self.get_param()
5382            .get("annotation", {})
5383            .get("options", {})
5384            .get("annotations_update", False)
5385        )
5386        log.debug(f"force_update_annotation={force_update_annotation}")
5387        force_append_annotation = (
5388            self.get_param()
5389            .get("annotation", {})
5390            .get("options", {})
5391            .get("annotations_append", False)
5392        )
5393        log.debug(f"force_append_annotation={force_append_annotation}")
5394
5395        # Data
5396        table_variants = self.get_table_variants()
5397
5398        # Check if not empty
5399        log.debug("Check if not empty")
5400        sql_query_chromosomes_df = self.get_query_to_df(
5401            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
5402        )
5403        if not sql_query_chromosomes_df["count"][0]:
5404            log.info(f"VCF empty")
5405            return
5406
5407        # VCF header
5408        vcf_reader = self.get_header()
5409        log.debug("Initial header: " + str(vcf_reader.infos))
5410
5411        # Nb Variants POS
5412        log.debug("NB Variants Start")
5413        nb_variants = self.conn.execute(
5414            f"SELECT count(*) AS count FROM variants"
5415        ).fetchdf()["count"][0]
5416        log.debug("NB Variants Stop")
5417
5418        # Existing annotations
5419        for vcf_annotation in self.get_header().infos:
5420
5421            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
5422            log.debug(
5423                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
5424            )
5425
5426        # Added columns
5427        added_columns = []
5428
5429        # drop indexes
5430        log.debug(f"Drop indexes...")
5431        self.drop_indexes()
5432
5433        if annotations:
5434
5435            if "ALL" in annotations:
5436
5437                all_param = annotations.get("ALL", {})
5438                all_param_formats = all_param.get("formats", None)
5439                all_param_releases = all_param.get("releases", None)
5440
5441                databases_infos_dict = self.scan_databases(
5442                    database_formats=all_param_formats,
5443                    database_releases=all_param_releases,
5444                )
5445                for database_infos in databases_infos_dict.keys():
5446                    if database_infos not in annotations:
5447                        annotations[database_infos] = {"INFO": None}
5448
5449            for annotation in annotations:
5450
5451                if annotation in ["ALL"]:
5452                    continue
5453
5454                # Annotation Name
5455                annotation_name = os.path.basename(annotation)
5456
5457                # Annotation fields
5458                annotation_fields = annotations[annotation]
5459                if not annotation_fields:
5460                    annotation_fields = {"INFO": None}
5461
5462                log.debug(f"Annotation '{annotation_name}'")
5463                log.debug(
5464                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
5465                )
5466
5467                # Create Database
5468                database = Database(
5469                    database=annotation,
5470                    databases_folders=databases_folders,
5471                    assembly=assembly,
5472                )
5473
5474                # Find files
5475                parquet_file = database.get_database()
5476                parquet_hdr_file = database.get_header_file()
5477                parquet_type = database.get_type()
5478
5479                # Check if files exists
5480                if not parquet_file or not parquet_hdr_file:
5481                    log.error("Annotation failed: file not found")
5482                    raise ValueError("Annotation failed: file not found")
5483                else:
5484                    # Get parquet connexion
5485                    parquet_sql_attach = database.get_sql_database_attach(
5486                        output="query"
5487                    )
5488                    if parquet_sql_attach:
5489                        self.conn.execute(parquet_sql_attach)
5490                    parquet_file_link = database.get_sql_database_link()
5491                    # Log
5492                    log.debug(
5493                        f"Annotation '{annotation_name}' - file: "
5494                        + str(parquet_file)
5495                        + " and "
5496                        + str(parquet_hdr_file)
5497                    )
5498
5499                    # Database full header columns
5500                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
5501                        parquet_hdr_file
5502                    )
5503                    # Log
5504                    log.debug(
5505                        "Annotation database header columns : "
5506                        + str(parquet_hdr_vcf_header_columns)
5507                    )
5508
5509                    # Load header as VCF object
5510                    parquet_hdr_vcf_header_infos = database.get_header().infos
5511                    # Log
5512                    log.debug(
5513                        "Annotation database header: "
5514                        + str(parquet_hdr_vcf_header_infos)
5515                    )
5516
5517                    # Get extra infos
5518                    parquet_columns = database.get_extra_columns()
5519                    # Log
5520                    log.debug("Annotation database Columns: " + str(parquet_columns))
5521
5522                    # Add extra columns if "ALL" in annotation_fields
5523                    # if "ALL" in annotation_fields:
5524                    #     allow_add_extra_column = True
5525                    if "ALL" in annotation_fields and database.get_extra_columns():
5526                        for extra_column in database.get_extra_columns():
5527                            if (
5528                                extra_column not in annotation_fields
5529                                and extra_column.replace("INFO/", "")
5530                                not in parquet_hdr_vcf_header_infos
5531                            ):
5532                                parquet_hdr_vcf_header_infos[extra_column] = (
5533                                    vcf.parser._Info(
5534                                        extra_column,
5535                                        ".",
5536                                        "String",
5537                                        f"{extra_column} description",
5538                                        "unknown",
5539                                        "unknown",
5540                                        self.code_type_map["String"],
5541                                    )
5542                                )
5543
5544                    # For all fields in database
5545                    annotation_fields_all = False
5546                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
5547                        annotation_fields_all = True
5548                        annotation_fields = {
5549                            key: key for key in parquet_hdr_vcf_header_infos
5550                        }
5551
5552                        log.debug(
5553                            "Annotation database header - All annotations added: "
5554                            + str(annotation_fields)
5555                        )
5556
5557                    # Init
5558
5559                    # List of annotation fields to use
5560                    sql_query_annotation_update_info_sets = []
5561
5562                    # List of annotation to agregate
5563                    sql_query_annotation_to_agregate = []
5564
5565                    # Number of fields
5566                    nb_annotation_field = 0
5567
5568                    # Annotation fields processed
5569                    annotation_fields_processed = []
5570
5571                    # Columns mapping
5572                    map_columns = database.map_columns(
5573                        columns=annotation_fields, prefixes=["INFO/"]
5574                    )
5575
5576                    # Query dict for fields to remove (update option)
5577                    query_dict_remove = {}
5578
5579                    # Fetch Anotation fields
5580                    for annotation_field in annotation_fields:
5581
5582                        # annotation_field_column
5583                        annotation_field_column = map_columns.get(
5584                            annotation_field, "INFO"
5585                        )
5586
5587                        # field new name, if parametered
5588                        annotation_fields_new_name = annotation_fields.get(
5589                            annotation_field, annotation_field
5590                        )
5591                        if not annotation_fields_new_name:
5592                            annotation_fields_new_name = annotation_field
5593
5594                        # To annotate
5595                        # force_update_annotation = True
5596                        # force_append_annotation = True
5597                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
5598                        if annotation_field in parquet_hdr_vcf_header_infos and (
5599                            force_update_annotation
5600                            or force_append_annotation
5601                            or (
5602                                annotation_fields_new_name
5603                                not in self.get_header().infos
5604                            )
5605                        ):
5606
5607                            # Add field to annotation to process list
5608                            annotation_fields_processed.append(
5609                                annotation_fields_new_name
5610                            )
5611
5612                            # explode infos for the field
5613                            annotation_fields_new_name_info_msg = ""
5614                            if (
5615                                force_update_annotation
5616                                and annotation_fields_new_name
5617                                in self.get_header().infos
5618                            ):
5619                                # Remove field from INFO
5620                                query = f"""
5621                                    UPDATE {table_variants} as table_variants
5622                                    SET INFO = REGEXP_REPLACE(
5623                                                concat(table_variants.INFO,''),
5624                                                ';*{annotation_fields_new_name}=[^;]*',
5625                                                ''
5626                                                )
5627                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
5628                                """
5629                                annotation_fields_new_name_info_msg = " [update]"
5630                                query_dict_remove[
5631                                    f"remove 'INFO/{annotation_fields_new_name}'"
5632                                ] = query
5633
5634                            # Sep between fields in INFO
5635                            nb_annotation_field += 1
5636                            if nb_annotation_field > 1:
5637                                annotation_field_sep = ";"
5638                            else:
5639                                annotation_field_sep = ""
5640
5641                            log.info(
5642                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
5643                            )
5644
5645                            # Add INFO field to header
5646                            parquet_hdr_vcf_header_infos_number = (
5647                                parquet_hdr_vcf_header_infos[annotation_field].num
5648                                or "."
5649                            )
5650                            parquet_hdr_vcf_header_infos_type = (
5651                                parquet_hdr_vcf_header_infos[annotation_field].type
5652                                or "String"
5653                            )
5654                            parquet_hdr_vcf_header_infos_description = (
5655                                parquet_hdr_vcf_header_infos[annotation_field].desc
5656                                or f"{annotation_field} description"
5657                            )
5658                            parquet_hdr_vcf_header_infos_source = (
5659                                parquet_hdr_vcf_header_infos[annotation_field].source
5660                                or "unknown"
5661                            )
5662                            parquet_hdr_vcf_header_infos_version = (
5663                                parquet_hdr_vcf_header_infos[annotation_field].version
5664                                or "unknown"
5665                            )
5666
5667                            vcf_reader.infos[annotation_fields_new_name] = (
5668                                vcf.parser._Info(
5669                                    annotation_fields_new_name,
5670                                    parquet_hdr_vcf_header_infos_number,
5671                                    parquet_hdr_vcf_header_infos_type,
5672                                    parquet_hdr_vcf_header_infos_description,
5673                                    parquet_hdr_vcf_header_infos_source,
5674                                    parquet_hdr_vcf_header_infos_version,
5675                                    self.code_type_map[
5676                                        parquet_hdr_vcf_header_infos_type
5677                                    ],
5678                                )
5679                            )
5680
5681                            # Append
5682                            if force_append_annotation:
5683                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
5684                            else:
5685                                query_case_when_append = ""
5686
5687                            # Annotation/Update query fields
5688                            # Found in INFO column
5689                            if (
5690                                annotation_field_column == "INFO"
5691                                and "INFO" in parquet_hdr_vcf_header_columns
5692                            ):
5693                                sql_query_annotation_update_info_sets.append(
5694                                    f"""
5695                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
5696                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
5697                                        ELSE ''
5698                                    END
5699                                """
5700                                )
5701                            # Found in a specific column
5702                            else:
5703                                sql_query_annotation_update_info_sets.append(
5704                                    f"""
5705                                CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append}
5706                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(table_parquet."{annotation_field_column}", ';', ','))
5707                                        ELSE ''
5708                                    END
5709                                """
5710                                )
5711                                sql_query_annotation_to_agregate.append(
5712                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
5713                                )
5714
5715                        # Not to annotate
5716                        else:
5717
5718                            if force_update_annotation:
5719                                annotation_message = "forced"
5720                            else:
5721                                annotation_message = "skipped"
5722
5723                            if annotation_field not in parquet_hdr_vcf_header_infos:
5724                                log.warning(
5725                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
5726                                )
5727                            if annotation_fields_new_name in self.get_header().infos:
5728                                log.warning(
5729                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
5730                                )
5731
5732                    # Check if ALL fields have to be annotated. Thus concat all INFO field
5733                    # allow_annotation_full_info = True
5734                    allow_annotation_full_info = not force_append_annotation
5735
5736                    if parquet_type in ["regions"]:
5737                        allow_annotation_full_info = False
5738
5739                    if (
5740                        allow_annotation_full_info
5741                        and nb_annotation_field == len(annotation_fields)
5742                        and annotation_fields_all
5743                        and (
5744                            "INFO" in parquet_hdr_vcf_header_columns
5745                            and "INFO" in database.get_extra_columns()
5746                        )
5747                    ):
5748                        log.debug("Column INFO annotation enabled")
5749                        sql_query_annotation_update_info_sets = []
5750                        sql_query_annotation_update_info_sets.append(
5751                            f" table_parquet.INFO "
5752                        )
5753
5754                    if sql_query_annotation_update_info_sets:
5755
5756                        # Annotate
5757                        log.info(f"Annotation '{annotation_name}' - Annotation...")
5758
5759                        # Join query annotation update info sets for SQL
5760                        sql_query_annotation_update_info_sets_sql = ",".join(
5761                            sql_query_annotation_update_info_sets
5762                        )
5763
5764                        # Check chromosomes list (and variants infos)
5765                        sql_query_chromosomes = f"""
5766                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
5767                            FROM {table_variants} as table_variants
5768                            GROUP BY table_variants."#CHROM"
5769                            ORDER BY table_variants."#CHROM"
5770                            """
5771                        sql_query_chromosomes_df = self.conn.execute(
5772                            sql_query_chromosomes
5773                        ).df()
5774                        sql_query_chromosomes_dict = {
5775                            entry["CHROM"]: {
5776                                "count": entry["count_variants"],
5777                                "min": entry["min_variants"],
5778                                "max": entry["max_variants"],
5779                            }
5780                            for index, entry in sql_query_chromosomes_df.iterrows()
5781                        }
5782
5783                        # Init
5784                        nb_of_query = 0
5785                        nb_of_variant_annotated = 0
5786                        query_dict = query_dict_remove
5787
5788                        # for chrom in sql_query_chromosomes_df["CHROM"]:
5789                        for chrom in sql_query_chromosomes_dict:
5790
5791                            # Number of variant by chromosome
5792                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
5793                                chrom, {}
5794                            ).get("count", 0)
5795
5796                            log.debug(
5797                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
5798                            )
5799
5800                            # Annotation with regions database
5801                            if parquet_type in ["regions"]:
5802                                sql_query_annotation_from_clause = f"""
5803                                    FROM (
5804                                        SELECT 
5805                                            '{chrom}' AS \"#CHROM\",
5806                                            table_variants_from.\"POS\" AS \"POS\",
5807                                            {",".join(sql_query_annotation_to_agregate)}
5808                                        FROM {table_variants} as table_variants_from
5809                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
5810                                            table_parquet_from."#CHROM" = '{chrom}'
5811                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
5812                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
5813                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
5814                                                )
5815                                        )
5816                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
5817                                        GROUP BY table_variants_from.\"POS\"
5818                                        )
5819                                        as table_parquet
5820                                """
5821
5822                                sql_query_annotation_where_clause = """
5823                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
5824                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
5825                                """
5826
5827                            # Annotation with variants database
5828                            else:
5829                                sql_query_annotation_from_clause = f"""
5830                                    FROM {parquet_file_link} as table_parquet
5831                                """
5832                                sql_query_annotation_where_clause = f"""
5833                                    table_variants."#CHROM" = '{chrom}'
5834                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
5835                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
5836                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
5837                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
5838                                """
5839
5840                            # Create update query
5841                            sql_query_annotation_chrom_interval_pos = f"""
5842                                UPDATE {table_variants} as table_variants
5843                                    SET INFO = 
5844                                        concat(
5845                                            CASE WHEN table_variants.INFO NOT IN ('','.')
5846                                                THEN table_variants.INFO
5847                                                ELSE ''
5848                                            END
5849                                            ,
5850                                            CASE WHEN table_variants.INFO NOT IN ('','.')
5851                                                        AND (
5852                                                        concat({sql_query_annotation_update_info_sets_sql})
5853                                                        )
5854                                                        NOT IN ('','.') 
5855                                                    THEN ';'
5856                                                    ELSE ''
5857                                            END
5858                                            ,
5859                                            {sql_query_annotation_update_info_sets_sql}
5860                                            )
5861                                    {sql_query_annotation_from_clause}
5862                                    WHERE {sql_query_annotation_where_clause}
5863                                    ;
5864                                """
5865
5866                            # Add update query to dict
5867                            query_dict[
5868                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
5869                            ] = sql_query_annotation_chrom_interval_pos
5870
5871                        nb_of_query = len(query_dict)
5872                        num_query = 0
5873
5874                        # SET max_expression_depth TO x
5875                        self.conn.execute("SET max_expression_depth TO 10000")
5876
5877                        for query_name in query_dict:
5878                            query = query_dict[query_name]
5879                            num_query += 1
5880                            log.info(
5881                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
5882                            )
5883                            result = self.conn.execute(query)
5884                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
5885                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
5886                            log.info(
5887                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
5888                            )
5889
5890                        log.info(
5891                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
5892                        )
5893
5894                    else:
5895
5896                        log.info(
5897                            f"Annotation '{annotation_name}' - No Annotations available"
5898                        )
5899
5900                    log.debug("Final header: " + str(vcf_reader.infos))
5901
5902        # Remove added columns
5903        for added_column in added_columns:
5904            self.drop_column(column=added_column)
5905
5906    def annotation_splice(self, threads: int = None) -> None:
5907        """
5908        This function annotate with snpEff
5909
5910        :param threads: The number of threads to use
5911        :return: the value of the variable "return_value".
5912        """
5913
5914        # DEBUG
5915        log.debug("Start annotation with splice tools")
5916
5917        # Threads
5918        if not threads:
5919            threads = self.get_threads()
5920        log.debug("Threads: " + str(threads))
5921
5922        # DEBUG
5923        delete_tmp = True
5924        if self.get_config().get("verbosity", "warning") in ["debug"]:
5925            delete_tmp = False
5926            log.debug("Delete tmp files/folders: " + str(delete_tmp))
5927
5928        # Config
5929        config = self.get_config()
5930        log.debug("Config: " + str(config))
5931        splice_config = config.get("tools", {}).get("splice", {})
5932        if not splice_config:
5933            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
5934        if not splice_config:
5935            msg_err = "No Splice tool config"
5936            log.error(msg_err)
5937            raise ValueError(msg_err)
5938        log.debug(f"splice_config={splice_config}")
5939
5940        # Config - Folders - Databases
5941        databases_folders = (
5942            config.get("folders", {}).get("databases", {}).get("splice", ["."])
5943        )
5944        log.debug("Databases annotations: " + str(databases_folders))
5945
5946        # Splice docker image
5947        splice_docker_image = splice_config.get("docker").get("image")
5948
5949        # Pull splice image if it's not already there
5950        if not check_docker_image_exists(splice_docker_image):
5951            log.warning(
5952                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
5953            )
5954            try:
5955                command(f"docker pull {splice_config.get('docker').get('image')}")
5956            except subprocess.CalledProcessError:
5957                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
5958                log.error(msg_err)
5959                raise ValueError(msg_err)
5960                return None
5961
5962        # Config - splice databases
5963        splice_databases = (
5964            config.get("folders", {})
5965            .get("databases", {})
5966            .get("splice", DEFAULT_SPLICE_FOLDER)
5967        )
5968        splice_databases = full_path(splice_databases)
5969
5970        # Param
5971        param = self.get_param()
5972        log.debug("Param: " + str(param))
5973
5974        # Param
5975        options = param.get("annotation", {}).get("splice", {})
5976        log.debug("Options: " + str(options))
5977
5978        # Data
5979        table_variants = self.get_table_variants()
5980
5981        # Check if not empty
5982        log.debug("Check if not empty")
5983        sql_query_chromosomes = (
5984            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
5985        )
5986        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
5987            log.info("VCF empty")
5988            return None
5989
5990        # Export in VCF
5991        log.debug("Create initial file to annotate")
5992
5993        # Create output folder
5994        output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
5995        if not os.path.exists(output_folder):
5996            Path(output_folder).mkdir(parents=True, exist_ok=True)
5997
5998        # Create tmp VCF file
5999        tmp_vcf = NamedTemporaryFile(
6000            prefix=self.get_prefix(),
6001            dir=output_folder,
6002            suffix=".vcf",
6003            delete=False,
6004        )
6005        tmp_vcf_name = tmp_vcf.name
6006
6007        # VCF header
6008        header = self.get_header()
6009
6010        # Existing annotations
6011        for vcf_annotation in self.get_header().infos:
6012
6013            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
6014            log.debug(
6015                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
6016            )
6017
6018        # Memory limit
6019        if config.get("memory", None):
6020            memory_limit = config.get("memory", "8G").upper()
6021            # upper()
6022        else:
6023            memory_limit = "8G"
6024        log.debug(f"memory_limit: {memory_limit}")
6025
6026        # Check number of variants to annotate
6027        where_clause_regex_spliceai = r"SpliceAI_\w+"
6028        where_clause_regex_spip = r"SPiP_\w+"
6029        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
6030        df_list_of_variants_to_annotate = self.get_query_to_df(
6031            query=f""" SELECT * FROM variants {where_clause} """
6032        )
6033        if len(df_list_of_variants_to_annotate) == 0:
6034            log.warning(
6035                f"No variants to annotate with splice. Variants probably already annotated with splice"
6036            )
6037            return None
6038        else:
6039            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
6040
6041        # Export VCF file
6042        self.export_variant_vcf(
6043            vcf_file=tmp_vcf_name,
6044            remove_info=True,
6045            add_samples=True,
6046            index=False,
6047            where_clause=where_clause,
6048        )
6049
6050        # Create docker container and launch splice analysis
6051        if splice_config:
6052
6053            # Splice mount folders
6054            mount_folders = splice_config.get("mount", {})
6055
6056            # Genome mount
6057            mount_folders[
6058                config.get("folders", {})
6059                .get("databases", {})
6060                .get("genomes", DEFAULT_GENOME_FOLDER)
6061            ] = "ro"
6062
6063            # SpliceAI mount
6064            mount_folders[
6065                config.get("folders", {})
6066                .get("databases", {})
6067                .get("spliceai", DEFAULT_SPLICEAI_FOLDER)
6068            ] = "ro"
6069
6070            # Genome mount
6071            mount_folders[
6072                config.get("folders", {})
6073                .get("databases", {})
6074                .get("spip", DEFAULT_SPIP_FOLDER)
6075            ] = "ro"
6076
6077            # Mount folders
6078            mount = []
6079
6080            # Config mount
6081            mount = [
6082                f"-v {full_path(path)}:{full_path(path)}:{mode}"
6083                for path, mode in mount_folders.items()
6084            ]
6085
6086            if any(value for value in splice_config.values() if value is None):
6087                log.warning("At least one splice config parameter is empty")
6088                return None
6089
6090            # Params in splice nf
6091            def check_values(dico: dict):
6092                """
6093                Ensure parameters for NF splice pipeline
6094                """
6095                for key, val in dico.items():
6096                    if key == "genome":
6097                        if any(
6098                            assemb in options.get("genome", {})
6099                            for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
6100                        ):
6101                            yield f"--{key} hg19"
6102                        elif any(
6103                            assemb in options.get("genome", {})
6104                            for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
6105                        ):
6106                            yield f"--{key} hg38"
6107                    elif (
6108                        (isinstance(val, str) and val)
6109                        or isinstance(val, int)
6110                        or isinstance(val, bool)
6111                    ):
6112                        yield f"--{key} {val}"
6113
6114            # Genome
6115            genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
6116            options["genome"] = genome
6117
6118            # NF params
6119            nf_params = []
6120
6121            # Add options
6122            if options:
6123                nf_params = list(check_values(options))
6124                log.debug(f"Splice NF params: {' '.join(nf_params)}")
6125            else:
6126                log.debug("No NF params provided")
6127
6128            # Add threads
6129            if "threads" not in options.keys():
6130                nf_params.append(f"--threads {threads}")
6131
6132            # Genome path
6133            genome_path = find_genome(
6134                config.get("folders", {})
6135                .get("databases", {})
6136                .get("genomes", DEFAULT_GENOME_FOLDER),
6137                file=f"{genome}.fa",
6138            )
6139            # Add genome path
6140            if not genome_path:
6141                raise ValueError(
6142                    f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
6143                )
6144            else:
6145                log.debug(f"Genome: {genome_path}")
6146                nf_params.append(f"--genome_path {genome_path}")
6147
6148            def splice_annotations(options: dict = {}, config: dict = {}) -> list:
6149                """
6150                Setting up updated databases for SPiP and SpliceAI
6151                """
6152
6153                try:
6154
6155                    # SpliceAI assembly transcriptome
6156                    spliceai_assembly = os.path.join(
6157                        config.get("folders", {})
6158                        .get("databases", {})
6159                        .get("spliceai", {}),
6160                        options.get("genome"),
6161                        "transcriptome",
6162                    )
6163                    spip_assembly = options.get("genome")
6164
6165                    spip = find(
6166                        f"transcriptome_{spip_assembly}.RData",
6167                        config.get("folders", {}).get("databases", {}).get("spip", {}),
6168                    )
6169                    spliceai = find("spliceai.refseq.txt", spliceai_assembly)
6170                    log.debug(f"SPiP annotations: {spip}")
6171                    log.debug(f"SpliceAI annotations: {spliceai}")
6172                    if spip and spliceai:
6173                        return [
6174                            f"--spip_transcriptome {spip}",
6175                            f"--spliceai_annotations {spliceai}",
6176                        ]
6177                    else:
6178                        # TODO crash and go on with basic annotations ?
6179                        # raise ValueError(
6180                        #     "Can't find splice databases in configuration EXIT"
6181                        # )
6182                        log.warning(
6183                            "Can't find splice databases in configuration, use annotations file from image"
6184                        )
6185                except TypeError:
6186                    log.warning(
6187                        "Can't find splice databases in configuration, use annotations file from image"
6188                    )
6189                    return []
6190
6191            # Add options, check if transcriptome option have already beend provided
6192            if (
6193                "spip_transcriptome" not in nf_params
6194                and "spliceai_transcriptome" not in nf_params
6195            ):
6196                splice_reference = splice_annotations(options, config)
6197                if splice_reference:
6198                    nf_params.extend(splice_reference)
6199
6200            nf_params.append(f"--output_folder {output_folder}")
6201
6202            random_uuid = f"HOWARD-SPLICE-{get_random()}"
6203            cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
6204            log.debug(cmd)
6205
6206            splice_config["docker"]["command"] = cmd
6207
6208            docker_cmd = get_bin_command(
6209                tool="splice",
6210                bin_type="docker",
6211                config=config,
6212                default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
6213                add_options=f"--name {random_uuid} {' '.join(mount)}",
6214            )
6215
6216            # Docker debug
6217            # if splice_config.get("rm_container"):
6218            #     rm_container = "--rm"
6219            # else:
6220            #     rm_container = ""
6221            # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
6222
6223            log.debug(docker_cmd)
6224            res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
6225            log.debug(res.stdout)
6226            if res.stderr:
6227                log.error(res.stderr)
6228            res.check_returncode()
6229        else:
6230            log.warning(f"Splice tool configuration not found: {config}")
6231
6232        # Update variants
6233        log.info("Annotation - Updating...")
6234        # Test find output vcf
6235        log.debug(
6236            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6237        )
6238        output_vcf = []
6239        # Wrong folder to look in
6240        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
6241            if (
6242                files
6243                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6244            ):
6245                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
6246        # log.debug(os.listdir(options.get("output_folder")))
6247        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
6248        if not output_vcf:
6249            log.debug(
6250                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
6251            )
6252        else:
6253            # Get new header from annotated vcf
6254            log.debug(f"Initial header: {len(header.infos)} fields")
6255            # Create new header with splice infos
6256            new_vcf = Variants(input=output_vcf[0])
6257            new_vcf_header = new_vcf.get_header().infos
6258            for keys, infos in new_vcf_header.items():
6259                if keys not in header.infos.keys():
6260                    header.infos[keys] = infos
6261            log.debug(f"New header: {len(header.infos)} fields")
6262            log.debug(f"Splice tmp output: {output_vcf[0]}")
6263            self.update_from_vcf(output_vcf[0])
6264
6265        # Remove folder
6266        remove_if_exists(output_folder)
6267
6268    ###
6269    # Prioritization
6270    ###
6271
6272    def get_config_default(self, name: str) -> dict:
6273        """
6274        The function `get_config_default` returns a dictionary containing default configurations for
6275        various calculations and prioritizations.
6276
6277        :param name: The `get_config_default` function returns a dictionary containing default
6278        configurations for different calculations and prioritizations. The `name` parameter is used to
6279        specify which specific configuration to retrieve from the dictionary
6280        :type name: str
6281        :return: The function `get_config_default` returns a dictionary containing default configuration
6282        settings for different calculations and prioritizations. The specific configuration settings are
6283        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
6284        matches a key in the `config_default` dictionary, the corresponding configuration settings are
6285        returned. If there is no match, an empty dictionary is returned.
6286        """
6287
6288        config_default = {
6289            "calculations": {
6290                "variant_chr_pos_alt_ref": {
6291                    "type": "sql",
6292                    "name": "variant_chr_pos_alt_ref",
6293                    "description": "Create a variant ID with chromosome, position, alt and ref",
6294                    "available": False,
6295                    "output_column_name": "variant_chr_pos_alt_ref",
6296                    "output_column_type": "String",
6297                    "output_column_description": "variant ID with chromosome, position, alt and ref",
6298                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
6299                    "operation_info": True,
6300                },
6301                "VARTYPE": {
6302                    "type": "sql",
6303                    "name": "VARTYPE",
6304                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
6305                    "available": True,
6306                    "output_column_name": "VARTYPE",
6307                    "output_column_type": "String",
6308                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
6309                    "operation_query": """
6310                            CASE
6311                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
6312                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
6313                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
6314                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
6315                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
6316                                ELSE 'UNDEFINED'
6317                            END
6318                            """,
6319                    "info_fields": ["SVTYPE"],
6320                    "operation_info": True,
6321                },
6322                "snpeff_hgvs": {
6323                    "type": "python",
6324                    "name": "snpeff_hgvs",
6325                    "description": "HGVS nomenclatures from snpEff annotation",
6326                    "available": True,
6327                    "function_name": "calculation_extract_snpeff_hgvs",
6328                    "function_params": ["snpeff_hgvs", "ANN"],
6329                },
6330                "snpeff_ann_explode": {
6331                    "type": "python",
6332                    "name": "snpeff_ann_explode",
6333                    "description": "Explode snpEff annotations with uniquify values",
6334                    "available": True,
6335                    "function_name": "calculation_snpeff_ann_explode",
6336                    "function_params": [False, "fields", "snpeff_", "ANN"],
6337                },
6338                "snpeff_ann_explode_uniquify": {
6339                    "type": "python",
6340                    "name": "snpeff_ann_explode_uniquify",
6341                    "description": "Explode snpEff annotations",
6342                    "available": True,
6343                    "function_name": "calculation_snpeff_ann_explode",
6344                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
6345                },
6346                "snpeff_ann_explode_json": {
6347                    "type": "python",
6348                    "name": "snpeff_ann_explode_json",
6349                    "description": "Explode snpEff annotations in JSON format",
6350                    "available": True,
6351                    "function_name": "calculation_snpeff_ann_explode",
6352                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
6353                },
6354                "NOMEN": {
6355                    "type": "python",
6356                    "name": "NOMEN",
6357                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field",
6358                    "available": True,
6359                    "function_name": "calculation_extract_nomen",
6360                    "function_params": [],
6361                },
6362                "FINDBYPIPELINE": {
6363                    "type": "python",
6364                    "name": "FINDBYPIPELINE",
6365                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
6366                    "available": True,
6367                    "function_name": "calculation_find_by_pipeline",
6368                    "function_params": ["findbypipeline"],
6369                },
6370                "FINDBYSAMPLE": {
6371                    "type": "python",
6372                    "name": "FINDBYSAMPLE",
6373                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
6374                    "available": True,
6375                    "function_name": "calculation_find_by_pipeline",
6376                    "function_params": ["findbysample"],
6377                },
6378                "GENOTYPECONCORDANCE": {
6379                    "type": "python",
6380                    "name": "GENOTYPECONCORDANCE",
6381                    "description": "Concordance of genotype for multi caller VCF",
6382                    "available": True,
6383                    "function_name": "calculation_genotype_concordance",
6384                    "function_params": [],
6385                },
6386                "BARCODE": {
6387                    "type": "python",
6388                    "name": "BARCODE",
6389                    "description": "BARCODE as VaRank tool",
6390                    "available": True,
6391                    "function_name": "calculation_barcode",
6392                    "function_params": [],
6393                },
6394                "BARCODEFAMILY": {
6395                    "type": "python",
6396                    "name": "BARCODEFAMILY",
6397                    "description": "BARCODEFAMILY as VaRank tool",
6398                    "available": True,
6399                    "function_name": "calculation_barcode_family",
6400                    "function_params": ["BCF"],
6401                },
6402                "TRIO": {
6403                    "type": "python",
6404                    "name": "TRIO",
6405                    "description": "Inheritance for a trio family",
6406                    "available": True,
6407                    "function_name": "calculation_trio",
6408                    "function_params": [],
6409                },
6410                "VAF": {
6411                    "type": "python",
6412                    "name": "VAF",
6413                    "description": "Variant Allele Frequency (VAF) harmonization",
6414                    "available": True,
6415                    "function_name": "calculation_vaf_normalization",
6416                    "function_params": [],
6417                },
6418                "VAF_stats": {
6419                    "type": "python",
6420                    "name": "VAF_stats",
6421                    "description": "Variant Allele Frequency (VAF) statistics",
6422                    "available": True,
6423                    "function_name": "calculation_genotype_stats",
6424                    "function_params": ["VAF"],
6425                },
6426                "DP_stats": {
6427                    "type": "python",
6428                    "name": "DP_stats",
6429                    "description": "Depth (DP) statistics",
6430                    "available": True,
6431                    "function_name": "calculation_genotype_stats",
6432                    "function_params": ["DP"],
6433                },
6434                "variant_id": {
6435                    "type": "python",
6436                    "name": "variant_id",
6437                    "description": "Variant ID generated from variant position and type",
6438                    "available": True,
6439                    "function_name": "calculation_variant_id",
6440                    "function_params": [],
6441                },
6442            },
6443            "prioritizations": {
6444                "default": {
6445                    "filter": [
6446                        {
6447                            "type": "notequals",
6448                            "value": "!PASS|\\.",
6449                            "score": 0,
6450                            "flag": "FILTERED",
6451                            "comment": ["Bad variant quality"],
6452                        },
6453                        {
6454                            "type": "equals",
6455                            "value": "REJECT",
6456                            "score": -20,
6457                            "flag": "PASS",
6458                            "comment": ["Bad variant quality"],
6459                        },
6460                    ],
6461                    "DP": [
6462                        {
6463                            "type": "gte",
6464                            "value": "50",
6465                            "score": 5,
6466                            "flag": "PASS",
6467                            "comment": ["DP higher than 50"],
6468                        }
6469                    ],
6470                    "ANN": [
6471                        {
6472                            "type": "contains",
6473                            "value": "HIGH",
6474                            "score": 5,
6475                            "flag": "PASS",
6476                            "comment": [
6477                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
6478                            ],
6479                        },
6480                        {
6481                            "type": "contains",
6482                            "value": "MODERATE",
6483                            "score": 3,
6484                            "flag": "PASS",
6485                            "comment": [
6486                                "A non-disruptive variant that might change protein effectiveness"
6487                            ],
6488                        },
6489                        {
6490                            "type": "contains",
6491                            "value": "LOW",
6492                            "score": 0,
6493                            "flag": "FILTERED",
6494                            "comment": [
6495                                "Assumed to be mostly harmless or unlikely to change protein behavior"
6496                            ],
6497                        },
6498                        {
6499                            "type": "contains",
6500                            "value": "MODIFIER",
6501                            "score": 0,
6502                            "flag": "FILTERED",
6503                            "comment": [
6504                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
6505                            ],
6506                        },
6507                    ],
6508                }
6509            },
6510        }
6511
6512        return config_default.get(name, None)
6513
6514    def get_config_json(
6515        self, name: str, config_dict: dict = {}, config_file: str = None
6516    ) -> dict:
6517        """
6518        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
6519        default values, a dictionary, and a file.
6520
6521        :param name: The `name` parameter in the `get_config_json` function is a string that represents
6522        the name of the configuration. It is used to identify and retrieve the configuration settings
6523        for a specific component or module
6524        :type name: str
6525        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
6526        dictionary that allows you to provide additional configuration settings or overrides. When you
6527        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
6528        the key is the configuration setting you want to override or
6529        :type config_dict: dict
6530        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
6531        specify the path to a configuration file that contains additional settings. If provided, the
6532        function will read the contents of this file and update the configuration dictionary with the
6533        values found in the file, overriding any existing values with the
6534        :type config_file: str
6535        :return: The function `get_config_json` returns a dictionary containing the configuration
6536        settings.
6537        """
6538
6539        # Create with default prioritizations
6540        config_default = self.get_config_default(name=name)
6541        configuration = config_default
6542        # log.debug(f"configuration={configuration}")
6543
6544        # Replace prioritizations from dict
6545        for config in config_dict:
6546            configuration[config] = config_dict[config]
6547
6548        # Replace prioritizations from file
6549        config_file = full_path(config_file)
6550        if config_file:
6551            if os.path.exists(config_file):
6552                with open(config_file) as config_file_content:
6553                    config_file_dict = json.load(config_file_content)
6554                for config in config_file_dict:
6555                    configuration[config] = config_file_dict[config]
6556            else:
6557                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
6558                log.error(msg_error)
6559                raise ValueError(msg_error)
6560
6561        return configuration
6562
6563    def prioritization(self) -> None:
6564        """
6565        It takes a VCF file, and adds a bunch of new INFO fields to it, based on the values of other
6566        INFO fields
6567        """
6568
6569        # Config
6570        config = self.get_config()
6571
6572        # Param
6573        param = self.get_param()
6574
6575        # Quick Prioritizations
6576        # prioritizations = param.get("prioritization", {}).get("prioritizations", "")
6577
6578        # Configuration profiles
6579        prioritization_config_file = param.get("prioritization", {}).get(
6580            "prioritization_config", None
6581        )
6582        prioritization_config_file = full_path(prioritization_config_file)
6583        prioritizations_config = self.get_config_json(
6584            name="prioritizations", config_file=prioritization_config_file
6585        )
6586
6587        # Prioritization options
6588        profiles = param.get("prioritization", {}).get("profiles", [])
6589        if isinstance(profiles, str):
6590            profiles = profiles.split(",")
6591        pzfields = param.get("prioritization", {}).get(
6592            "pzfields", ["PZFlag", "PZScore"]
6593        )
6594        if isinstance(pzfields, str):
6595            pzfields = pzfields.split(",")
6596        default_profile = param.get("prioritization", {}).get("default_profile", None)
6597        pzfields_sep = param.get("prioritization", {}).get("pzfields_sep", "_")
6598        prioritization_score_mode = param.get("prioritization", {}).get(
6599            "prioritization_score_mode", "HOWARD"
6600        )
6601
6602        # Quick Prioritizations
6603        # prioritizations = param.get("prioritization", {}).get("prioritizations", None)
6604        prioritizations = param.get("prioritizations", None)
6605        if prioritizations:
6606            log.info("Quick Prioritization:")
6607            for profile in prioritizations.split(","):
6608                if profile not in profiles:
6609                    profiles.append(profile)
6610                    log.info(f"   {profile}")
6611
6612        # If profile "ALL" provided, all profiles in the config profiles
6613        if "ALL" in profiles:
6614            profiles = list(prioritizations_config.keys())
6615
6616        for profile in profiles:
6617            if prioritizations_config.get(profile, None):
6618                log.debug(f"Profile '{profile}' configured")
6619            else:
6620                msg_error = f"Profile '{profile}' NOT configured"
6621                log.error(msg_error)
6622                raise ValueError(msg_error)
6623
6624        if profiles:
6625            log.info(f"Prioritization... ")
6626        else:
6627            log.debug(f"No profile defined")
6628            return
6629
6630        if not default_profile and len(profiles):
6631            default_profile = profiles[0]
6632
6633        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
6634        log.debug("Profiles to check: " + str(list(profiles)))
6635
6636        # Variables
6637        table_variants = self.get_table_variants(clause="update")
6638
6639        # Added columns
6640        added_columns = []
6641
6642        # Create list of PZfields
6643        # List of PZFields
6644        list_of_pzfields_original = pzfields + [
6645            pzfield + pzfields_sep + profile
6646            for pzfield in pzfields
6647            for profile in profiles
6648        ]
6649        list_of_pzfields = []
6650        log.debug(f"{list_of_pzfields_original}")
6651
6652        # Remove existing PZfields to use if exists
6653        for pzfield in list_of_pzfields_original:
6654            if self.get_header().infos.get(pzfield, None) is None:
6655                list_of_pzfields.append(pzfield)
6656                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
6657            else:
6658                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
6659
6660        if list_of_pzfields:
6661
6662            # Explode Infos fields
6663            explode_infos_prefix = self.get_explode_infos_prefix()
6664            added_columns += self.explode_infos(prefix=explode_infos_prefix)
6665            extra_infos = self.get_extra_infos()
6666
6667            # PZfields tags description
6668            PZfields_INFOS = {
6669                "PZTags": {
6670                    "ID": "PZTags",
6671                    "Number": ".",
6672                    "Type": "String",
6673                    "Description": "Variant tags based on annotation criteria",
6674                },
6675                "PZScore": {
6676                    "ID": "PZScore",
6677                    "Number": 1,
6678                    "Type": "Integer",
6679                    "Description": "Variant score based on annotation criteria",
6680                },
6681                "PZFlag": {
6682                    "ID": "PZFlag",
6683                    "Number": 1,
6684                    "Type": "String",
6685                    "Description": "Variant flag based on annotation criteria",
6686                },
6687                "PZComment": {
6688                    "ID": "PZComment",
6689                    "Number": ".",
6690                    "Type": "String",
6691                    "Description": "Variant comment based on annotation criteria",
6692                },
6693                "PZInfos": {
6694                    "ID": "PZInfos",
6695                    "Number": ".",
6696                    "Type": "String",
6697                    "Description": "Variant infos based on annotation criteria",
6698                },
6699            }
6700
6701            # Create INFO fields if not exist
6702            for field in PZfields_INFOS:
6703                field_ID = PZfields_INFOS[field]["ID"]
6704                field_description = PZfields_INFOS[field]["Description"]
6705                if field_ID not in self.get_header().infos and field_ID in pzfields:
6706                    field_description = (
6707                        PZfields_INFOS[field]["Description"]
6708                        + f", profile {default_profile}"
6709                    )
6710                    self.get_header().infos[field_ID] = vcf.parser._Info(
6711                        field_ID,
6712                        PZfields_INFOS[field]["Number"],
6713                        PZfields_INFOS[field]["Type"],
6714                        field_description,
6715                        "unknown",
6716                        "unknown",
6717                        code_type_map[PZfields_INFOS[field]["Type"]],
6718                    )
6719
6720            # Create INFO fields if not exist for each profile
6721            for profile in prioritizations_config:
6722                if profile in profiles or profiles == []:
6723                    for field in PZfields_INFOS:
6724                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
6725                        field_description = (
6726                            PZfields_INFOS[field]["Description"]
6727                            + f", profile {profile}"
6728                        )
6729                        if (
6730                            field_ID not in self.get_header().infos
6731                            and field in pzfields
6732                        ):
6733                            self.get_header().infos[field_ID] = vcf.parser._Info(
6734                                field_ID,
6735                                PZfields_INFOS[field]["Number"],
6736                                PZfields_INFOS[field]["Type"],
6737                                field_description,
6738                                "unknown",
6739                                "unknown",
6740                                code_type_map[PZfields_INFOS[field]["Type"]],
6741                            )
6742
6743            # Header
6744            for pzfield in list_of_pzfields:
6745                if re.match("PZScore.*", pzfield):
6746                    added_column = self.add_column(
6747                        table_name=table_variants,
6748                        column_name=pzfield,
6749                        column_type="INTEGER",
6750                        default_value="0",
6751                    )
6752                elif re.match("PZFlag.*", pzfield):
6753                    added_column = self.add_column(
6754                        table_name=table_variants,
6755                        column_name=pzfield,
6756                        column_type="BOOLEAN",
6757                        default_value="1",
6758                    )
6759                else:
6760                    added_column = self.add_column(
6761                        table_name=table_variants,
6762                        column_name=pzfield,
6763                        column_type="STRING",
6764                        default_value="''",
6765                    )
6766                added_columns.append(added_column)
6767
6768            # Profiles
6769            if profiles:
6770
6771                # foreach profile in configuration file
6772                for profile in prioritizations_config:
6773
6774                    # If profile is asked in param, or ALL are asked (empty profile [])
6775                    if profile in profiles or profiles == []:
6776                        log.info(f"Profile '{profile}'")
6777
6778                        sql_set_info_option = ""
6779
6780                        sql_set_info = []
6781
6782                        # PZ fields set
6783
6784                        # PZScore
6785                        if f"PZScore{pzfields_sep}{profile}" in list_of_pzfields:
6786                            sql_set_info.append(
6787                                f"""
6788                                    concat(
6789                                        'PZScore{pzfields_sep}{profile}=',
6790                                        PZScore{pzfields_sep}{profile}
6791                                    ) 
6792                                """
6793                            )
6794                            if (
6795                                profile == default_profile
6796                                and "PZScore" in list_of_pzfields
6797                            ):
6798                                sql_set_info.append(
6799                                    f"""
6800                                        concat(
6801                                            'PZScore=',
6802                                            PZScore{pzfields_sep}{profile}
6803                                        )
6804                                    """
6805                                )
6806
6807                        # PZFlag
6808                        if f"PZFlag{pzfields_sep}{profile}" in list_of_pzfields:
6809                            sql_set_info.append(
6810                                f"""
6811                                    concat(
6812                                        'PZFlag{pzfields_sep}{profile}=',
6813                                        CASE 
6814                                            WHEN PZFlag{pzfields_sep}{profile}==1
6815                                            THEN 'PASS'
6816                                            WHEN PZFlag{pzfields_sep}{profile}==0
6817                                            THEN 'FILTERED'
6818                                        END
6819                                    ) 
6820                                """
6821                            )
6822                            if (
6823                                profile == default_profile
6824                                and "PZFlag" in list_of_pzfields
6825                            ):
6826                                sql_set_info.append(
6827                                    f"""
6828                                        concat(
6829                                            'PZFlag=',
6830                                            CASE 
6831                                                WHEN PZFlag{pzfields_sep}{profile}==1
6832                                                THEN 'PASS'
6833                                                WHEN PZFlag{pzfields_sep}{profile}==0
6834                                                THEN 'FILTERED'
6835                                            END
6836                                        )
6837                                    """
6838                                )
6839
6840                        # PZComment
6841                        if f"PZComment{pzfields_sep}{profile}" in list_of_pzfields:
6842                            sql_set_info.append(
6843                                f"""
6844                                    CASE
6845                                        WHEN PZComment{pzfields_sep}{profile} NOT IN ('')
6846                                        THEN concat('PZComment{pzfields_sep}{profile}=', PZComment{pzfields_sep}{profile})
6847                                        ELSE ''
6848                                    END
6849                                """
6850                            )
6851                            if (
6852                                profile == default_profile
6853                                and "PZComment" in list_of_pzfields
6854                            ):
6855                                sql_set_info.append(
6856                                    f"""
6857                                        CASE
6858                                            WHEN PZComment{pzfields_sep}{profile} NOT IN ('')
6859                                            THEN concat('PZComment=', PZComment{pzfields_sep}{profile})
6860                                            ELSE ''
6861                                        END
6862                                    """
6863                                )
6864
6865                        # PZInfos
6866                        if f"PZInfos{pzfields_sep}{profile}" in list_of_pzfields:
6867                            sql_set_info.append(
6868                                f"""
6869                                    CASE
6870                                        WHEN PZInfos{pzfields_sep}{profile} NOT IN ('')
6871                                        THEN concat('PZInfos{pzfields_sep}{profile}=', PZInfos{pzfields_sep}{profile})
6872                                        ELSE ''
6873                                    END
6874                                """
6875                            )
6876                            if (
6877                                profile == default_profile
6878                                and "PZInfos" in list_of_pzfields
6879                            ):
6880                                sql_set_info.append(
6881                                    f"""
6882                                        CASE
6883                                            WHEN PZInfos{pzfields_sep}{profile} NOT IN ('')
6884                                            THEN concat('PZInfos=', PZInfos{pzfields_sep}{profile})
6885                                            ELSE ''
6886                                        END
6887                                    """
6888                                )
6889
6890                        # Merge PZfields
6891                        sql_set_info_option = ""
6892                        sql_set_sep = ""
6893                        for sql_set in sql_set_info:
6894                            if sql_set_sep:
6895                                sql_set_info_option += f"""
6896                                    , concat('{sql_set_sep}', {sql_set})
6897                                """
6898                            else:
6899                                sql_set_info_option += f"""
6900                                    , {sql_set}
6901                                """
6902                            sql_set_sep = ";"
6903
6904                        sql_queries = []
6905                        for annotation in prioritizations_config[profile]:
6906
6907                            # Check if annotation field is present
6908                            if not f"{explode_infos_prefix}{annotation}" in extra_infos:
6909                                log.debug(f"Annotation '{annotation}' not in data")
6910                                continue
6911                            else:
6912                                log.debug(f"Annotation '{annotation}' in data")
6913
6914                            # For each criterions
6915                            for criterion in prioritizations_config[profile][
6916                                annotation
6917                            ]:
6918                                criterion_type = criterion["type"]
6919                                criterion_value = criterion["value"]
6920                                criterion_score = criterion.get("score", 0)
6921                                criterion_flag = criterion.get("flag", "PASS")
6922                                criterion_flag_bool = criterion_flag == "PASS"
6923                                criterion_comment = (
6924                                    ", ".join(criterion.get("comment", []))
6925                                    .replace("'", "''")
6926                                    .replace(";", ",")
6927                                    .replace("\t", " ")
6928                                )
6929                                criterion_infos = (
6930                                    str(criterion)
6931                                    .replace("'", "''")
6932                                    .replace(";", ",")
6933                                    .replace("\t", " ")
6934                                )
6935
6936                                sql_set = []
6937                                sql_set_info = []
6938
6939                                # PZ fields set
6940                                if (
6941                                    f"PZScore{pzfields_sep}{profile}"
6942                                    in list_of_pzfields
6943                                ):
6944                                    if prioritization_score_mode == "HOWARD":
6945                                        sql_set.append(
6946                                            f"PZScore{pzfields_sep}{profile} = PZScore{pzfields_sep}{profile} + {criterion_score}"
6947                                        )
6948                                    elif prioritization_score_mode == "VaRank":
6949                                        sql_set.append(
6950                                            f"PZScore{pzfields_sep}{profile} = CASE WHEN {criterion_score}>PZScore{pzfields_sep}{profile} THEN {criterion_score} END"
6951                                        )
6952                                    else:
6953                                        sql_set.append(
6954                                            f"PZScore{pzfields_sep}{profile} = PZScore{pzfields_sep}{profile} + {criterion_score}"
6955                                        )
6956                                if f"PZFlag{pzfields_sep}{profile}" in list_of_pzfields:
6957                                    sql_set.append(
6958                                        f"PZFlag{pzfields_sep}{profile} = PZFlag{pzfields_sep}{profile} AND {criterion_flag_bool}"
6959                                    )
6960                                if (
6961                                    f"PZComment{pzfields_sep}{profile}"
6962                                    in list_of_pzfields
6963                                ):
6964                                    sql_set.append(
6965                                        f"""
6966                                            PZComment{pzfields_sep}{profile} = 
6967                                                concat(
6968                                                    PZComment{pzfields_sep}{profile},
6969                                                    CASE 
6970                                                        WHEN PZComment{pzfields_sep}{profile}!=''
6971                                                        THEN ', '
6972                                                        ELSE ''
6973                                                    END,
6974                                                    '{criterion_comment}'
6975                                                )
6976                                        """
6977                                    )
6978                                if (
6979                                    f"PZInfos{pzfields_sep}{profile}"
6980                                    in list_of_pzfields
6981                                ):
6982                                    sql_set.append(
6983                                        f"""
6984                                            PZInfos{pzfields_sep}{profile} = 
6985                                                concat(
6986                                                    PZInfos{pzfields_sep}{profile},
6987                                                    '{criterion_infos}'
6988                                                )
6989                                        """
6990                                    )
6991                                sql_set_option = ",".join(sql_set)
6992
6993                                # Criterion and comparison
6994                                try:
6995                                    float(criterion_value)
6996                                    sql_update = f"""
6997                                        UPDATE {table_variants}
6998                                        SET {sql_set_option}
6999                                        WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
7000                                        AND "{explode_infos_prefix}{annotation}"{comparison_map[criterion_type]}{criterion_value}
7001                                        """
7002                                except:
7003                                    contains_option = ""
7004                                    if criterion_type == "contains":
7005                                        contains_option = ".*"
7006                                    sql_update = f"""
7007                                        UPDATE {table_variants}
7008                                        SET {sql_set_option}
7009                                        WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
7010                                        """
7011                                sql_queries.append(sql_update)
7012
7013                        # PZTags
7014                        if f"PZTags{pzfields_sep}{profile}" in list_of_pzfields:
7015
7016                            # Create PZFalgs value
7017                            pztags_value = ""
7018                            pztags_sep_default = "|"
7019                            pztags_sep = ""
7020                            for pzfield in pzfields:
7021                                if pzfield not in ["PZTags"]:
7022                                    if (
7023                                        f"{pzfield}{pzfields_sep}{profile}"
7024                                        in list_of_pzfields
7025                                    ):
7026                                        if pzfield in ["PZFlag"]:
7027                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
7028                                                CASE WHEN PZFlag{pzfields_sep}{profile}
7029                                                    THEN 'PASS'
7030                                                    ELSE 'FILTERED'
7031                                                END, '"""
7032                                        else:
7033                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
7034                                        pztags_sep = pztags_sep_default
7035
7036                            # Add Query update for PZFlags
7037                            sql_update_pztags = f"""
7038                                UPDATE {table_variants}
7039                                SET INFO = concat(
7040                                        INFO,
7041                                        CASE WHEN INFO NOT in ('','.')
7042                                                THEN ';'
7043                                                ELSE ''
7044                                        END,
7045                                        'PZTags{pzfields_sep}{profile}={pztags_value}'
7046                                    )
7047                                """
7048                            sql_queries.append(sql_update_pztags)
7049
7050                            # Add Query update for PZFlags for default
7051                            if profile == default_profile:
7052                                sql_update_pztags_default = f"""
7053                                UPDATE {table_variants}
7054                                SET INFO = concat(
7055                                        INFO,
7056                                        ';',
7057                                        'PZTags={pztags_value}'
7058                                    )
7059                                """
7060                                sql_queries.append(sql_update_pztags_default)
7061
7062                        log.info(f"""Profile '{profile}' - Prioritization... """)
7063
7064                        if sql_queries:
7065
7066                            for sql_query in sql_queries:
7067                                log.debug(
7068                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
7069                                )
7070                                self.conn.execute(sql_query)
7071
7072                        log.info(f"""Profile '{profile}' - Update... """)
7073                        sql_query_update = f"""
7074                            UPDATE {table_variants}
7075                            SET INFO =  
7076                                concat(
7077                                    CASE
7078                                        WHEN INFO NOT IN ('','.')
7079                                        THEN concat(INFO, ';')
7080                                        ELSE ''
7081                                    END
7082                                    {sql_set_info_option}
7083                                )
7084                        """
7085                        self.conn.execute(sql_query_update)
7086
7087        else:
7088
7089            log.warning(f"No profiles in parameters")
7090
7091        # Remove added columns
7092        for added_column in added_columns:
7093            self.drop_column(column=added_column)
7094
7095        # Explode INFOS fields into table fields
7096        if self.get_explode_infos():
7097            self.explode_infos(
7098                prefix=self.get_explode_infos_prefix(),
7099                fields=self.get_explode_infos_fields(),
7100                force=True,
7101            )
7102
7103        return
7104
7105    ###
7106    # HGVS
7107    ###
7108
    def annotation_hgvs(self, threads: int = None) -> None:
        """
        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
        coordinates and alleles.

        Overall flow:
        - resolve genome (pyfaidx), refSeq and refSeqLink database files from config/param,
        - load refSeq/refSeqLink into polars dataframes via duckDB for transcript lookup,
        - compute the HGVS name(s) of each SNV/InDel variant in parallel (Dask partitions),
        - write the result back into the variants table and append it to the INFO field,
        - register the new 'hgvs' INFO field in the VCF header.

        :param threads: The `threads` parameter is an optional integer that specifies the number of
        threads to use for parallel processing. If no value is provided, it will default to the number
        of threads obtained from the `get_threads()` method
        :type threads: int
        """

        # Function for each partition of the Dask Dataframe
        def partition_function(partition):
            """
            The function `partition_function` applies the `annotation_hgvs_partition` function to
            each row of a DataFrame called `partition`.

            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
            to be processed
            :return: the result of applying the "annotation_hgvs_partition" function to each row of
            the "partition" dataframe along the axis 1.
            """
            return partition.apply(annotation_hgvs_partition, axis=1)

        def annotation_hgvs_partition(row) -> str:
            """
            The function `annotation_hgvs_partition` takes in a row of data and returns a string
            containing a list of HGVS names associated with the given genomic coordinates and alleles.

            Relies on the enclosing scope for `polars_conn`, `refseq_df`, `refseqlink_df`,
            `transcripts`, `genome` and the HGVS formatting options.

            :param row: A dictionary-like object that contains the values for the following keys:
            :return: a string that contains the HGVS names associated with the given row of data.
            """

            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find list of associated transcripts
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                SELECT transcript
                FROM refseq_df
                WHERE CHROM='{chr}'
                AND POS={pos}
            """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                        SELECT protein
                        FROM refseqlink_df
                        WHERE transcript='{transcript_name}'
                        LIMIT 1
                    """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # Optionally emit a second, protein-level HGVS name alongside the first
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Create liste of HGVS annotations
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full

        # Polars connexion
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        # Config
        config = self.get_config()

        # Databases
        # Genome
        databases_genomes_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER)
        )
        databases_genome = (
            config.get("folders", {}).get("databases", {}).get("genomes", "")
        )
        # refseq database folder
        databases_refseq_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("refseq", DEFAULT_REFSEQ_FOLDER)
        )
        # refseq
        databases_refseq = config.get("databases", {}).get("refSeq", None)
        # refSeqLink
        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

        # Param
        param = self.get_param()

        # Quick HGVS: parse 'hgvs_options' ("opt[=val],opt2[=val2],...") into param["hgvs"]
        if "hgvs_options" in param and param.get("hgvs_options", ""):
            log.info(f"Quick HGVS Annotation:")
            if not param.get("hgvs", None):
                param["hgvs"] = {}
            for option in param.get("hgvs_options", "").split(","):
                option_var_val = option.split("=")
                option_var = option_var_val[0]
                if len(option_var_val) > 1:
                    option_val = option_var_val[1]
                else:
                    # Bare option name means enabling the flag
                    option_val = "True"
                if option_val.upper() in ["TRUE"]:
                    option_val = True
                elif option_val.upper() in ["FALSE"]:
                    option_val = False
                log.info(f"   {option_var}={option_val}")
                param["hgvs"][option_var] = option_val

        # Check if HGVS annotation enabled
        if "hgvs" in param:
            log.info(f"HGVS Annotation... ")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            # No HGVS section in param: nothing to do
            return

        # HGVS Param
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        codon_type = param_hgvs.get("codon_type", "3")

        # refSseq refSeqLink (param overrides config)
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome: prefer explicit genome path, fall back to folder+assembly lookup
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSseq
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only (REF/ALT restricted to letters only)
        query_variants = f"""
            SELECT "#CHROM" AS CHROM, POS, REF, ALT
            FROM {table_variants}
            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
            """
        df_variants = self.get_query_to_df(query_variants)

        # Added columns (tracked so they can be dropped at the end)
        added_columns = []

        # Add hgvs column in variants table (random suffix to avoid name collisions)
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug(f"refSeq loading...")
        # refSeq in duckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Loading all refSeq in Dataframe: only transcripts overlapping a variant position
        refseq_query = f"""
            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
            FROM {refseq_table}
            JOIN df_variants ON (
                {refseq_table}.chrom = df_variants.CHROM
                AND {refseq_table}.txStart<=df_variants.POS
                AND {refseq_table}.txEnd>=df_variants.POS
            )
        """
        refseq_df = self.conn.query(refseq_query).pl()

        if refseqlink_file:
            log.debug(f"refSeqLink loading...")
            # refSeqLink in duckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Loading all refSeqLink in Dataframe (transcript -> protein accession mapping)
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
                FROM {refseqlink_table} 
                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
                WHERE protAcc_without_ver IS NOT NULL
            """
            # Polars Dataframe
            # NOTE(review): if refseqlink_file is absent, refseqlink_df is never defined;
            # annotation_hgvs_partition then presumably requires use_protein/add_protein/
            # full_format to be disabled — confirm against callers
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read RefSeq transcripts into a python dict/model.
        # Exported through a temp TSV file because read_transcripts consumes a file handle
        log.debug(f"Transcripts loading...")
        with tempfile.TemporaryDirectory() as tmpdir:
            transcripts_query = f"""
                COPY (
                    SELECT {refseq_table}.*
                    FROM {refseq_table}
                    JOIN df_variants ON (
                        {refseq_table}.chrom=df_variants.CHROM
                        AND {refseq_table}.txStart<=df_variants.POS
                        AND {refseq_table}.txEnd>=df_variants.POS
                    )
                )
                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
            """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connexion (re-created so refseq_df/refseqlink_df globals are registered)
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create
        # a Dask Dataframe from Pandas dataframe with partition as number of threads
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Use dask.dataframe.apply() to apply function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame to Pandas Dataframe (triggers the parallel computation)
        df = ddf.compute()

        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column, matching variants on CHROM/POS/REF/ALT
            # NOTE(review): the UPDATE references the literal alias 'variants' rather than
            # {table_variants} — assumes get_table_variants() resolves to 'variants'; verify
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
                """
            self.execute_query(update_variant_query)

        # Update INFO column: append 'hgvs=<value>' with ';' separator when INFO is non-empty
        sql_query_update = f"""
            UPDATE {table_variants}
            SET INFO = 
                concat(
                    CASE 
                        WHEN INFO NOT IN ('','.')
                        THEN concat(INFO, ';')
                        ELSE ''
                    END,
                    'hgvs=',
                    {hgvs_column_name}
                )
            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
            """
        self.execute_query(sql_query_update)

        # Add header: declare the new 'hgvs' INFO field in the VCF header
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added columns (the temporary hgvs working column)
        for added_column in added_columns:
            self.drop_column(column=added_column)
7491
7492    ###
7493    # Calculation
7494    ###
7495
7496    def get_operations_help(
7497        self, operations_config_dict: dict = {}, operations_config_file: str = None
7498    ) -> list:
7499
7500        # Init
7501        operations_help = []
7502
7503        # operations
7504        operations = self.get_config_json(
7505            name="calculations",
7506            config_dict=operations_config_dict,
7507            config_file=operations_config_file,
7508        )
7509        for op in operations:
7510            op_name = operations[op].get("name", op).upper()
7511            op_description = operations[op].get("description", op_name)
7512            op_available = operations[op].get("available", False)
7513            if op_available:
7514                operations_help.append(f"   {op_name}: {op_description}")
7515
7516        # Sort operations
7517        operations_help.sort()
7518
7519        # insert header
7520        operations_help.insert(0, "Available calculation operations:")
7521
7522        # Return
7523        return operations_help
7524
7525    def calculation(
7526        self,
7527        operations: dict = {},
7528        operations_config_dict: dict = {},
7529        operations_config_file: str = None,
7530    ) -> None:
7531        """
7532        It takes a list of operations, and for each operation, it checks if it's a python or sql
7533        operation, and then calls the appropriate function
7534
7535        param json example:
7536            "calculation": {
7537                "NOMEN": {
7538                    "options": {
7539                        "hgvs_field": "hgvs"
7540                    },
7541                "middle" : null
7542            }
7543        """
7544
7545        # Param
7546        param = self.get_param()
7547
7548        # operations config
7549        operations_config = self.get_config_json(
7550            name="calculations",
7551            config_dict=operations_config_dict,
7552            config_file=operations_config_file,
7553        )
7554
7555        # Upper keys
7556        operations_config = {k.upper(): v for k, v in operations_config.items()}
7557
7558        # Calculations
7559
7560        # Operations from param
7561        operations = param.get("calculation", {}).get("calculations", operations)
7562
7563        # Quick calculation - add
7564        if param.get("calculations", None):
7565            calculations_list = [
7566                value for value in param.get("calculations", "").split(",")
7567            ]
7568            log.info(f"Quick Calculations:")
7569            for calculation_key in calculations_list:
7570                log.info(f"   {calculation_key}")
7571            for calculation_operation in calculations_list:
7572                if calculation_operation.upper() not in operations:
7573                    operations[calculation_operation.upper()] = {}
7574                    add_value_into_dict(
7575                        dict_tree=param,
7576                        sections=[
7577                            "calculation",
7578                            "calculations",
7579                            calculation_operation.upper(),
7580                        ],
7581                        value={},
7582                    )
7583
7584        # Operations for calculation
7585        if not operations:
7586            operations = param.get("calculation", {}).get("calculations", {})
7587
7588        if operations:
7589            log.info(f"Calculations...")
7590
7591        # For each operations
7592        for operation_name in operations:
7593            operation_name = operation_name.upper()
7594            if operation_name not in [""]:
7595                if operation_name in operations_config:
7596                    log.info(f"Calculation '{operation_name}'")
7597                    operation = operations_config[operation_name]
7598                    operation_type = operation.get("type", "sql")
7599                    if operation_type == "python":
7600                        self.calculation_process_function(
7601                            operation=operation, operation_name=operation_name
7602                        )
7603                    elif operation_type == "sql":
7604                        self.calculation_process_sql(
7605                            operation=operation, operation_name=operation_name
7606                        )
7607                    else:
7608                        log.error(
7609                            f"Operations config: Type '{operation_type}' NOT available"
7610                        )
7611                        raise ValueError(
7612                            f"Operations config: Type '{operation_type}' NOT available"
7613                        )
7614                else:
7615                    log.error(
7616                        f"Operations config: Calculation '{operation_name}' NOT available"
7617                    )
7618                    raise ValueError(
7619                        f"Operations config: Calculation '{operation_name}' NOT available"
7620                    )
7621
7622        # Explode INFOS fields into table fields
7623        if self.get_explode_infos():
7624            self.explode_infos(
7625                prefix=self.get_explode_infos_prefix(),
7626                fields=self.get_explode_infos_fields(),
7627                force=True,
7628            )
7629
7630    def calculation_process_sql(
7631        self, operation: dict, operation_name: str = "unknown"
7632    ) -> None:
7633        """
7634        The `calculation_process_sql` function takes in a mathematical operation as a string and
7635        performs the operation, updating the specified table with the result.
7636
7637        :param operation: The `operation` parameter is a dictionary that contains information about the
7638        mathematical operation to be performed. It includes the following keys:
7639        :type operation: dict
7640        :param operation_name: The `operation_name` parameter is a string that represents the name of
7641        the mathematical operation being performed. It is used for logging and error handling purposes,
7642        defaults to unknown
7643        :type operation_name: str (optional)
7644        """
7645
7646        # table variants
7647        table_variants = self.get_table_variants(clause="alter")
7648
7649        # Operation infos
7650        operation_name = operation.get("name", "unknown")
7651        log.debug(f"process sql {operation_name}")
7652        output_column_name = operation.get("output_column_name", operation_name)
7653        output_column_type = operation.get("output_column_type", "String")
7654        prefix = operation.get("explode_infos_prefix", "")
7655        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
7656        output_column_description = operation.get(
7657            "output_column_description", f"{operation_name} operation"
7658        )
7659        operation_query = operation.get("operation_query", None)
7660        if isinstance(operation_query, list):
7661            operation_query = " ".join(operation_query)
7662        operation_info_fields = operation.get("info_fields", [])
7663        operation_info_fields_check = operation.get("info_fields_check", False)
7664        operation_info = operation.get("operation_info", True)
7665
7666        if operation_query:
7667
7668            # Info fields check
7669            operation_info_fields_check_result = True
7670            if operation_info_fields_check:
7671                header_infos = self.get_header().infos
7672                for info_field in operation_info_fields:
7673                    operation_info_fields_check_result = (
7674                        operation_info_fields_check_result
7675                        and info_field in header_infos
7676                    )
7677
7678            # If info fields available
7679            if operation_info_fields_check_result:
7680
7681                # Added_columns
7682                added_columns = []
7683
7684                # Create VCF header field
7685                vcf_reader = self.get_header()
7686                vcf_reader.infos[output_column_name] = vcf.parser._Info(
7687                    output_column_name,
7688                    ".",
7689                    output_column_type,
7690                    output_column_description,
7691                    "howard calculation",
7692                    "0",
7693                    self.code_type_map.get(output_column_type),
7694                )
7695
7696                # Explode infos if needed
7697                log.debug(f"calculation_process_sql prefix {prefix}")
7698                added_columns += self.explode_infos(
7699                    prefix=prefix,
7700                    fields=[output_column_name] + operation_info_fields,
7701                    force=True,
7702                )
7703
7704                # Create column
7705                added_column = self.add_column(
7706                    table_name=table_variants,
7707                    column_name=prefix + output_column_name,
7708                    column_type=output_column_type_sql,
7709                    default_value="null",
7710                )
7711                added_columns.append(added_column)
7712
7713                # Operation calculation
7714                try:
7715
7716                    # Query to update calculation column
7717                    sql_update = f"""
7718                        UPDATE {table_variants}
7719                        SET "{prefix}{output_column_name}" = ({operation_query})
7720                    """
7721                    self.conn.execute(sql_update)
7722
7723                    # Add to INFO
7724                    if operation_info:
7725                        sql_update_info = f"""
7726                            UPDATE {table_variants}
7727                            SET "INFO" =
7728                                concat(
7729                                    CASE
7730                                        WHEN "INFO" IS NOT NULL
7731                                        THEN concat("INFO", ';')
7732                                        ELSE ''
7733                                    END,
7734                                    '{output_column_name}=',
7735                                    "{prefix}{output_column_name}"
7736                                )
7737                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
7738                        """
7739                        self.conn.execute(sql_update_info)
7740
7741                except:
7742                    log.error(
7743                        f"Operations config: Calculation '{operation_name}' query failed"
7744                    )
7745                    raise ValueError(
7746                        f"Operations config: Calculation '{operation_name}' query failed"
7747                    )
7748
7749                # Remove added columns
7750                for added_column in added_columns:
7751                    log.debug(f"added_column: {added_column}")
7752                    self.drop_column(column=added_column)
7753
7754            else:
7755                log.error(
7756                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
7757                )
7758                raise ValueError(
7759                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
7760                )
7761
7762        else:
7763            log.error(
7764                f"Operations config: Calculation '{operation_name}' query NOT defined"
7765            )
7766            raise ValueError(
7767                f"Operations config: Calculation '{operation_name}' query NOT defined"
7768            )
7769
7770    def calculation_process_function(
7771        self, operation: dict, operation_name: str = "unknown"
7772    ) -> None:
7773        """
7774        The `calculation_process_function` takes in an operation dictionary and performs the specified
7775        function with the given parameters.
7776
7777        :param operation: The `operation` parameter is a dictionary that contains information about the
7778        operation to be performed. It has the following keys:
7779        :type operation: dict
7780        :param operation_name: The `operation_name` parameter is a string that represents the name of
7781        the operation being performed. It is used for logging purposes, defaults to unknown
7782        :type operation_name: str (optional)
7783        """
7784
7785        operation_name = operation["name"]
7786        log.debug(f"process sql {operation_name}")
7787        function_name = operation["function_name"]
7788        function_params = operation["function_params"]
7789        getattr(self, function_name)(*function_params)
7790
7791    def calculation_variant_id(self) -> None:
7792        """
7793        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
7794        updates the INFO field of a variants table with the variant ID.
7795        """
7796
7797        # variant_id annotation field
7798        variant_id_tag = self.get_variant_id_column()
7799        added_columns = [variant_id_tag]
7800
7801        # variant_id hgvs tags"
7802        vcf_infos_tags = {
7803            variant_id_tag: "howard variant ID annotation",
7804        }
7805
7806        # Variants table
7807        table_variants = self.get_table_variants()
7808
7809        # Header
7810        vcf_reader = self.get_header()
7811
7812        # Add variant_id to header
7813        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
7814            variant_id_tag,
7815            ".",
7816            "String",
7817            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
7818            "howard calculation",
7819            "0",
7820            self.code_type_map.get("String"),
7821        )
7822
7823        # Update
7824        sql_update = f"""
7825            UPDATE {table_variants}
7826            SET "INFO" = 
7827                concat(
7828                    CASE
7829                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
7830                        THEN ''
7831                        ELSE concat("INFO", ';')
7832                    END,
7833                    '{variant_id_tag}=',
7834                    "{variant_id_tag}"
7835                )
7836        """
7837        self.conn.execute(sql_update)
7838
7839        # Remove added columns
7840        for added_column in added_columns:
7841            self.drop_column(column=added_column)
7842
    def calculation_extract_snpeff_hgvs(
        self,
        snpeff_hgvs: str = "snpeff_hgvs",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        Extract HGVS nomenclatures from the snpEff annotation INFO field and
        append them to each variant's INFO field under a new tag.

        The snpEff field is exploded into a column, the annotation sub-field
        layout is parsed out of the header description, each row is processed
        with ``extract_snpeff_hgvs`` through a pandas dataframe, and the
        result is merged back into the variants table with an
        ``UPDATE ... FROM`` query. Helper columns are dropped at the end.
        If the snpEff field is absent from the header, a warning is logged
        and nothing is updated.

        :param snpeff_hgvs: Name of the INFO tag that will store the HGVS
            nomenclatures extracted from the snpEff annotations, defaults to
            snpeff_hgvs
        :type snpeff_hgvs: str (optional)
        :param snpeff_field: Name of the INFO field containing the snpEff
            annotations to parse, defaults to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the snpEff header description does not contain
            a quoted ' A | B | ... ' sub-field list
        """

        # Description attached to the new INFO tag in the VCF header
        vcf_infos_tags = {
            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
        }

        # Prefix for exploded INFO columns
        # NOTE(review): any non-empty configured prefix is overridden with
        # "INFO/" here — confirm this forced override is intended
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Exploded column names for the annotation source and the HGVS result
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # VCF header object
        vcf_reader = self.get_header()

        # Helper columns created below; all dropped before returning
        added_columns = []

        # Explode the snpEff INFO field into its own column
        added_columns += self.explode_infos(fields=[snpeff_field])

        if snpeff_field in vcf_reader.infos:

            log.debug(vcf_reader.infos[snpeff_field])

            # Parse the annotation sub-field names from the quoted
            # ' A | B | ... ' list in the header description
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Keep only alphanumeric characters to build a clean key
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Helper column holding a per-row variant identifier, used as the
            # join key for the UPDATE below
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Pull variant id and snpEff annotations into a pandas dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Compute the HGVS nomenclatures row by row
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: extract_snpeff_hgvs(
                    str(x), header=list(ann_header_desc.values())
                )
            )

            # Register the new INFO tag in the VCF header
            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
                snpeff_hgvs,
                ".",
                "String",
                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Merge results back into the variants table; the dataframe is
            # addressed by its local variable name (DuckDB replacement scan),
            # joining on the variant id column
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE 
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                    '{snpeff_hgvs}=',
                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Release the dataframe memory eagerly
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Drop the helper columns added for this calculation
        for added_column in added_columns:
            self.drop_column(column=added_column)
7979
    def calculation_snpeff_ann_explode(
        self,
        uniquify: bool = True,
        output_format: str = "fields",
        output_prefix: str = "snpeff_",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        Explode snpEff annotations into either per-sub-field INFO tags or a
        single JSON INFO tag, and append the result to each variant's INFO.

        The snpEff field is exploded into a column, its sub-field layout is
        parsed from the header description, each row is processed with
        ``explode_snpeff_ann`` through a pandas dataframe, and the result is
        merged back into the variants table with an ``UPDATE ... FROM``
        query. Helper columns are dropped at the end. If the snpEff field is
        absent from the header, a warning is logged and nothing is updated.

        :param uniquify: Whether duplicate annotation values should be
            removed from the exploded output, defaults to True
        :type uniquify: bool (optional)
        :param output_format: Output format of the exploded annotations:
            "fields" produces one INFO tag per annotation sub-field, "JSON"
            produces a single tag holding a JSON document, defaults to fields
        :type output_format: str (optional)
        :param output_prefix: Prefix prepended to the generated INFO tags to
            distinguish them from existing annotations, defaults to snpeff_
        :type output_prefix: str (optional)
        :param snpeff_field: Name of the INFO field containing the snpEff
            annotations to explode, defaults to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the snpEff header description does not contain
            a quoted ' A | B | ... ' sub-field list
        """

        # Internal name of the exploded-annotations helper column
        snpeff_hgvs = "snpeff_ann_explode"

        # Description attached to the generated INFO tags in the VCF header
        vcf_infos_tags = {
            snpeff_hgvs: "Explode snpEff annotations",
        }

        # Prefix for exploded INFO columns
        # NOTE(review): any non-empty configured prefix is overridden with
        # "INFO/" here — confirm this forced override is intended
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Exploded column names for the annotation source and the result
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # VCF header object
        vcf_reader = self.get_header()

        # Helper columns created below; all dropped before returning
        added_columns = []

        # Explode the snpEff INFO field into its own column
        added_columns += self.explode_infos(fields=[snpeff_field])
        log.debug(f"snpeff_field={snpeff_field}")
        log.debug(f"added_columns={added_columns}")

        if snpeff_field in vcf_reader.infos:

            # Parse the annotation sub-field names from the quoted
            # ' A | B | ... ' list in the header description
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Keep only alphanumeric characters to build a clean key
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Helper column holding a per-row variant identifier, used as the
            # join key for the UPDATE below
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Pull variant id and snpEff annotations into a pandas dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Explode the annotations row by row in the requested format
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: explode_snpeff_ann(
                    str(x),
                    uniquify=uniquify,
                    output_format=output_format,
                    prefix=output_prefix,
                    header=list(ann_header_desc.values()),
                )
            )

            # Register header entries: one tag for JSON output, or one tag
            # per annotation sub-field for "fields" output
            ann_annotations_prefix = ""
            if output_format.upper() in ["JSON"]:
                # JSON output is emitted as '<output_prefix>=<json>'
                ann_annotations_prefix = f"{output_prefix}="
                vcf_reader.infos[output_prefix] = vcf.parser._Info(
                    output_prefix,
                    ".",
                    "String",
                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                    + " - JSON format",
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
            else:
                for ann_annotation in ann_header:
                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
                        ann_annotation_id,
                        ".",
                        "String",
                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
                        "howard calculation",
                        "0",
                        self.code_type_map.get("String"),
                    )

            # Merge results back into the variants table; the dataframe is
            # addressed by its local variable name (DuckDB replacement scan),
            # joining on the variant id column
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE 
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{ann_annotations_prefix}',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Release the dataframe memory eagerly
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Drop the helper columns added for this calculation
        for added_column in added_columns:
            self.drop_column(column=added_column)
8154
    def calculation_extract_nomen(self) -> None:
        """
        Extract NOMEN hgvs nomenclatures from an HGVS annotation field and
        append them to each variant's INFO field.

        The HGVS field (configured in param under
        ``calculation.calculations.NOMEN.options.hgvs_field``) is exploded
        into a column, each row is parsed with ``find_nomen`` — optionally
        constrained by a preferred-transcripts file — and the resulting NOMEN
        components (NOMEN, CNOMEN, PNOMEN, ...) are merged back into the
        variants table with an ``UPDATE ... FROM`` query. Helper columns are
        dropped at the end. If the exploded HGVS column is not available,
        nothing is updated.

        :raises ValueError: if the configured transcripts file does not exist
        """

        # Name of the temporary dataframe column holding the NOMEN dict
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN components to extract, with their header descriptions
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Parameters dictionary
        param = self.get_param()

        # Prefix for exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # VCF header object
        vcf_reader = self.get_header()

        # Name of the INFO field containing the HGVS annotations
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Optional preferred-transcripts file (first column = transcript ids)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        transcripts = []
        if transcripts_file:
            if os.path.exists(transcripts_file):
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts = transcripts_dataframe.iloc[:, 0].tolist()
            else:
                log.error(f"Transcript file '{transcripts_file}' does NOT exist")
                raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")

        # Helper columns created below; all dropped before returning
        added_columns = []

        # Explode the HGVS INFO field into its own column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # Only proceed if the exploded HGVS column is actually available
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Pull variant coordinates and HGVS values into a dataframe
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """
            )

            # Parse the NOMEN components for each row
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
                lambda x: find_nomen(str(x), transcripts=transcripts)
            )

            # For each NOMEN component: materialize it as a dataframe column,
            # register it in the VCF header, and build its SQL concat clause
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Explode each field into a column
                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                    lambda x: dict(x).get(nomen_field, "")
                )

                # Create VCF header field
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
                sql_nomen_fields.append(
                    f"""
                        CASE 
                            WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                            THEN concat(
                                    ';{nomen_field}=',
                                    dataframe_hgvs."{nomen_field}"
                                )
                            ELSE ''
                        END
                    """
                )

            # Combine all component clauses into one SET expression
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Merge results back into the variants table; the dataframe is
            # addressed by its local variable name (DuckDB replacement scan),
            # joining on the variant coordinates
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS" 
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Release the dataframe memory eagerly
            del dataframe_hgvs
            gc.collect()

        # Drop the helper columns added for this calculation
        for added_column in added_columns:
            self.drop_column(column=added_column)
8297
    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
        """
        Compute, for each variant, the number of pipelines/samples in which
        it was found, and append the result to the variant's INFO field.

        The FORMAT column and all sample columns are loaded into a pandas
        dataframe, each row is processed with ``findbypipeline``, and the
        result is merged back into the variants table with an
        ``UPDATE ... FROM`` query. Helper columns are dropped at the end.
        If the file has no FORMAT column or no samples, nothing is done.

        :param tag: Name of the INFO tag that will store the
            "findbypipeline" result; also used for the VCF header entry,
            defaults to findbypipeline
        :type tag: str (optional)
        """

        # Only applicable when genotype data is present (FORMAT + samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # INFO tag to create
            findbypipeline_tag = tag

            # Description attached to the new INFO tag in the VCF header
            vcf_infos_tags = {
                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
            }

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Dataframe column holding the computed result
            findbypipeline_infos = prefix + findbypipeline_tag

            # Variants table
            table_variants = self.get_table_variants()

            # VCF header object
            vcf_reader = self.get_header()

            # Helper column holding a per-row variant identifier, used as the
            # join key for the UPDATE below
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant id, FORMAT and every sample column
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Pull genotype data into a pandas dataframe
            dataframe_findbypipeline = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the findbypipeline value row by row
            dataframe_findbypipeline[findbypipeline_infos] = (
                dataframe_findbypipeline.apply(
                    lambda row: findbypipeline(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Register the new INFO tag in the VCF header
            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
                findbypipeline_tag,
                ".",
                "String",
                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Merge results back into the variants table; the dataframe is
            # addressed by its local variable name (DuckDB replacement scan),
            # joining on the variant id column
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE 
                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
                            THEN concat(
                                    '{findbypipeline_tag}=',
                                    dataframe_findbypipeline."{findbypipeline_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_findbypipeline
                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Drop the helper columns added for this calculation
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the dataframe memory eagerly
            del dataframe_findbypipeline
            gc.collect()
8403
8404    def calculation_genotype_concordance(self) -> None:
8405        """
8406        The function `calculation_genotype_concordance` calculates the genotype concordance for
8407        multi-caller VCF files and updates the variant information in the database.
8408        """
8409
8410        # if FORMAT and samples
8411        if (
8412            "FORMAT" in self.get_header_columns_as_list()
8413            and self.get_header_sample_list()
8414        ):
8415
8416            # genotypeconcordance annotation field
8417            genotypeconcordance_tag = "genotypeconcordance"
8418
8419            # VCF infos tags
8420            vcf_infos_tags = {
8421                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
8422            }
8423
8424            # Prefix
8425            prefix = self.get_explode_infos_prefix()
8426
8427            # Field
8428            genotypeconcordance_infos = prefix + genotypeconcordance_tag
8429
8430            # Variants table
8431            table_variants = self.get_table_variants()
8432
8433            # Header
8434            vcf_reader = self.get_header()
8435
8436            # Create variant id
8437            variant_id_column = self.get_variant_id_column()
8438            added_columns = [variant_id_column]
8439
8440            # variant_id, FORMAT and samples
8441            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8442                self.get_header_sample_list()
8443            )
8444
8445            # Create dataframe
8446            dataframe_genotypeconcordance = self.get_query_to_df(
8447                f""" SELECT {samples_fields} FROM {table_variants} """
8448            )
8449
8450            # Create genotypeconcordance column
8451            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
8452                dataframe_genotypeconcordance.apply(
8453                    lambda row: genotypeconcordance(
8454                        row, samples=self.get_header_sample_list()
8455                    ),
8456                    axis=1,
8457                )
8458            )
8459
8460            # Add genotypeconcordance to header
8461            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
8462                genotypeconcordance_tag,
8463                ".",
8464                "String",
8465                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
8466                "howard calculation",
8467                "0",
8468                self.code_type_map.get("String"),
8469            )
8470
8471            # Update
8472            sql_update = f"""
8473                UPDATE variants
8474                SET "INFO" = 
8475                    concat(
8476                        CASE
8477                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8478                            THEN ''
8479                            ELSE concat("INFO", ';')
8480                        END,
8481                        CASE
8482                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
8483                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
8484                            THEN concat(
8485                                    '{genotypeconcordance_tag}=',
8486                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
8487                                )
8488                            ELSE ''
8489                        END
8490                    )
8491                FROM dataframe_genotypeconcordance
8492                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
8493            """
8494            self.conn.execute(sql_update)
8495
8496            # Remove added columns
8497            for added_column in added_columns:
8498                self.drop_column(column=added_column)
8499
8500            # Delete dataframe
8501            del dataframe_genotypeconcordance
8502            gc.collect()
8503
8504    def calculation_barcode(self, tag: str = "barcode") -> None:
8505        """
8506        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
8507        updates the INFO field in the file with the calculated barcode values.
8508        
8509        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
8510        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
8511        the default tag name is set to "barcode", defaults to barcode
8512        :type tag: str (optional)
8513        """
8514
8515        # if FORMAT and samples
8516        if (
8517            "FORMAT" in self.get_header_columns_as_list()
8518            and self.get_header_sample_list()
8519        ):
8520
8521            # barcode annotation field
8522            if not tag:
8523                tag = "barcode"
8524
8525            # VCF infos tags
8526            vcf_infos_tags = {
8527                tag: "barcode calculation (VaRank)",
8528            }
8529
8530            # Prefix
8531            prefix = self.get_explode_infos_prefix()
8532
8533            # Field
8534            barcode_infos = prefix + tag
8535
8536            # Variants table
8537            table_variants = self.get_table_variants()
8538
8539            # Header
8540            vcf_reader = self.get_header()
8541
8542            # Create variant id
8543            variant_id_column = self.get_variant_id_column()
8544            added_columns = [variant_id_column]
8545
8546            # variant_id, FORMAT and samples
8547            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8548                self.get_header_sample_list()
8549            )
8550
8551            # Create dataframe
8552            dataframe_barcode = self.get_query_to_df(
8553                f""" SELECT {samples_fields} FROM {table_variants} """
8554            )
8555
8556            # Create barcode column
8557            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
8558                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
8559            )
8560
8561            # Add barcode to header
8562            vcf_reader.infos[tag] = vcf.parser._Info(
8563                tag,
8564                ".",
8565                "String",
8566                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
8567                "howard calculation",
8568                "0",
8569                self.code_type_map.get("String"),
8570            )
8571
8572            # Update
8573            sql_update = f"""
8574                UPDATE {table_variants}
8575                SET "INFO" = 
8576                    concat(
8577                        CASE
8578                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8579                            THEN ''
8580                            ELSE concat("INFO", ';')
8581                        END,
8582                        CASE
8583                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
8584                            AND dataframe_barcode."{barcode_infos}" NOT NULL
8585                            THEN concat(
8586                                    '{tag}=',
8587                                    dataframe_barcode."{barcode_infos}"
8588                                )
8589                            ELSE ''
8590                        END
8591                    )
8592                FROM dataframe_barcode
8593                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
8594            """
8595            self.conn.execute(sql_update)
8596
8597            # Remove added columns
8598            for added_column in added_columns:
8599                self.drop_column(column=added_column)
8600
8601            # Delete dataframe
8602            del dataframe_barcode
8603            gc.collect()
8604
8605    def calculation_barcode_family(self, tag: str = "BCF") -> None:
8606        """
8607        The `calculation_barcode_family` function calculates barcode values for variants in a VCF file
8608        and updates the INFO field in the file with the calculated barcode values.
8609
8610        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
8611        the barcode tag that will be added to the VCF file during the calculation process. If no value
8612        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
8613        :type tag: str (optional)
8614        """
8615
8616        # if FORMAT and samples
8617        if (
8618            "FORMAT" in self.get_header_columns_as_list()
8619            and self.get_header_sample_list()
8620        ):
8621
8622            # barcode annotation field
8623            if not tag:
8624                tag = "BCF"
8625
8626            # VCF infos tags
8627            vcf_infos_tags = {
8628                tag: "barcode family calculation",
8629                f"{tag}S": "barcode family samples",
8630            }
8631
8632            # Param
8633            param = self.get_param()
8634            log.debug(f"param={param}")
8635
8636            # Prefix
8637            prefix = self.get_explode_infos_prefix()
8638
8639            # PED param
8640            ped = (
8641                param.get("calculation", {})
8642                .get("calculations", {})
8643                .get("BARCODEFAMILY", {})
8644                .get("family_pedigree", None)
8645            )
8646            log.debug(f"ped={ped}")
8647
8648            # Load PED
8649            if ped:
8650
8651                # Pedigree is a file
8652                if isinstance(ped, str) and os.path.exists(full_path(ped)):
8653                    log.debug("Pedigree is file")
8654                    with open(full_path(ped)) as ped:
8655                        ped = json.load(ped)
8656
8657                # Pedigree is a string
8658                elif isinstance(ped, str):
8659                    log.debug("Pedigree is str")
8660                    try:
8661                        ped = json.loads(ped)
8662                        log.debug("Pedigree is json str")
8663                    except ValueError as e:
8664                        ped_samples = ped.split(",")
8665                        ped = {}
8666                        for ped_sample in ped_samples:
8667                            ped[ped_sample] = ped_sample
8668
8669                # Pedigree is a dict
8670                elif isinstance(ped, dict):
8671                    log.debug("Pedigree is dict")
8672
8673                # Pedigree is not well formatted
8674                else:
8675                    msg_error = "Pedigree not well formatted"
8676                    log.error(msg_error)
8677                    raise ValueError(msg_error)
8678
8679                # Construct list
8680                ped_samples = list(ped.values())
8681
8682            else:
8683                log.debug("Pedigree not defined. Take all samples")
8684                ped_samples = self.get_header_sample_list()
8685                ped = {}
8686                for ped_sample in ped_samples:
8687                    ped[ped_sample] = ped_sample
8688
8689            # Check pedigree
8690            if not ped or len(ped) == 0:
8691                msg_error = f"Error in pedigree: samples {ped_samples}"
8692                log.error(msg_error)
8693                raise ValueError(msg_error)
8694
8695            # Log
8696            log.info(
8697                "Calculation 'BARCODEFAMILY' - Samples: "
8698                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
8699            )
8700            log.debug(f"ped_samples={ped_samples}")
8701
8702            # Field
8703            barcode_infos = prefix + tag
8704
8705            # Variants table
8706            table_variants = self.get_table_variants()
8707
8708            # Header
8709            vcf_reader = self.get_header()
8710
8711            # Create variant id
8712            variant_id_column = self.get_variant_id_column()
8713            added_columns = [variant_id_column]
8714
8715            # variant_id, FORMAT and samples
8716            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8717                ped_samples
8718            )
8719
8720            # Create dataframe
8721            dataframe_barcode = self.get_query_to_df(
8722                f""" SELECT {samples_fields} FROM {table_variants} """
8723            )
8724
8725            # Create barcode column
8726            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
8727                lambda row: barcode(row, samples=ped_samples), axis=1
8728            )
8729
8730            # Add barcode family to header
8731            # Add vaf_normalization to header
8732            vcf_reader.formats[tag] = vcf.parser._Format(
8733                id=tag,
8734                num=".",
8735                type="String",
8736                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
8737                type_code=self.code_type_map.get("String"),
8738            )
8739            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
8740                id=f"{tag}S",
8741                num=".",
8742                type="String",
8743                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
8744                type_code=self.code_type_map.get("String"),
8745            )
8746
8747            # Update
8748            # for sample in ped_samples:
8749            sql_update_set = []
8750            for sample in self.get_header_sample_list() + ["FORMAT"]:
8751                if sample in ped_samples:
8752                    value = f'dataframe_barcode."{barcode_infos}"'
8753                    value_samples = "'" + ",".join(ped_samples) + "'"
8754                elif sample == "FORMAT":
8755                    value = f"'{tag}'"
8756                    value_samples = f"'{tag}S'"
8757                else:
8758                    value = "'.'"
8759                    value_samples = "'.'"
8760                format_regex = r"[a-zA-Z0-9\s]"
8761                sql_update_set.append(
8762                    f"""
8763                        "{sample}" = 
8764                        concat(
8765                            CASE
8766                                WHEN {table_variants}."{sample}" = './.'
8767                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
8768                                ELSE {table_variants}."{sample}"
8769                            END,
8770                            ':',
8771                            {value},
8772                            ':',
8773                            {value_samples}
8774                        )
8775                    """
8776                )
8777
8778            sql_update_set_join = ", ".join(sql_update_set)
8779            sql_update = f"""
8780                UPDATE {table_variants}
8781                SET {sql_update_set_join}
8782                FROM dataframe_barcode
8783                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
8784            """
8785            self.conn.execute(sql_update)
8786
8787            # Remove added columns
8788            for added_column in added_columns:
8789                self.drop_column(column=added_column)
8790
8791            # Delete dataframe
8792            del dataframe_barcode
8793            gc.collect()
8794
8795    def calculation_trio(self) -> None:
8796        """
8797        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
8798        information to the INFO field of each variant.
8799        """
8800
8801        # if FORMAT and samples
8802        if (
8803            "FORMAT" in self.get_header_columns_as_list()
8804            and self.get_header_sample_list()
8805        ):
8806
8807            # trio annotation field
8808            trio_tag = "trio"
8809
8810            # VCF infos tags
8811            vcf_infos_tags = {
8812                "trio": "trio calculation",
8813            }
8814
8815            # Param
8816            param = self.get_param()
8817
8818            # Prefix
8819            prefix = self.get_explode_infos_prefix()
8820
8821            # Trio param
8822            trio_ped = (
8823                param.get("calculation", {})
8824                .get("calculations", {})
8825                .get("TRIO", {})
8826                .get("trio_pedigree", None)
8827            )
8828
8829            # Load trio
8830            if trio_ped:
8831
8832                # Trio pedigree is a file
8833                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
8834                    log.debug("TRIO pedigree is file")
8835                    with open(full_path(trio_ped)) as trio_ped:
8836                        trio_ped = json.load(trio_ped)
8837
8838                # Trio pedigree is a string
8839                elif isinstance(trio_ped, str):
8840                    log.debug("TRIO pedigree is str")
8841                    try:
8842                        trio_ped = json.loads(trio_ped)
8843                        log.debug("TRIO pedigree is json str")
8844                    except ValueError as e:
8845                        trio_samples = trio_ped.split(",")
8846                        if len(trio_samples) == 3:
8847                            trio_ped = {
8848                                "father": trio_samples[0],
8849                                "mother": trio_samples[1],
8850                                "child": trio_samples[2],
8851                            }
8852                            log.debug("TRIO pedigree is list str")
8853                        else:
8854                            msg_error = "TRIO pedigree not well formatted"
8855                            log.error(msg_error)
8856                            raise ValueError(msg_error)
8857
8858                # Trio pedigree is a dict
8859                elif isinstance(trio_ped, dict):
8860                    log.debug("TRIO pedigree is dict")
8861
8862                # Trio pedigree is not well formatted
8863                else:
8864                    msg_error = "TRIO pedigree not well formatted"
8865                    log.error(msg_error)
8866                    raise ValueError(msg_error)
8867
8868                # Construct trio list
8869                trio_samples = [
8870                    trio_ped.get("father", ""),
8871                    trio_ped.get("mother", ""),
8872                    trio_ped.get("child", ""),
8873                ]
8874
8875            else:
8876                log.debug("TRIO pedigree not defined. Take the first 3 samples")
8877                samples_list = self.get_header_sample_list()
8878                if len(samples_list) >= 3:
8879                    trio_samples = self.get_header_sample_list()[0:3]
8880                    trio_ped = {
8881                        "father": trio_samples[0],
8882                        "mother": trio_samples[1],
8883                        "child": trio_samples[2],
8884                    }
8885                else:
8886                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
8887                    log.error(msg_error)
8888                    raise ValueError(msg_error)
8889
8890            # Check trio pedigree
8891            if not trio_ped or len(trio_ped) != 3:
8892                msg_error = f"Error in TRIO pedigree: {trio_ped}"
8893                log.error(msg_error)
8894                raise ValueError(msg_error)
8895
8896            # Log
8897            log.info(
8898                f"Calculation 'TRIO' - Samples: "
8899                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
8900            )
8901
8902            # Field
8903            trio_infos = prefix + trio_tag
8904
8905            # Variants table
8906            table_variants = self.get_table_variants()
8907
8908            # Header
8909            vcf_reader = self.get_header()
8910
8911            # Create variant id
8912            variant_id_column = self.get_variant_id_column()
8913            added_columns = [variant_id_column]
8914
8915            # variant_id, FORMAT and samples
8916            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8917                self.get_header_sample_list()
8918            )
8919
8920            # Create dataframe
8921            dataframe_trio = self.get_query_to_df(
8922                f""" SELECT {samples_fields} FROM {table_variants} """
8923            )
8924
8925            # Create trio column
8926            dataframe_trio[trio_infos] = dataframe_trio.apply(
8927                lambda row: trio(row, samples=trio_samples), axis=1
8928            )
8929
8930            # Add trio to header
8931            vcf_reader.infos[trio_tag] = vcf.parser._Info(
8932                trio_tag,
8933                ".",
8934                "String",
8935                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
8936                "howard calculation",
8937                "0",
8938                self.code_type_map.get("String"),
8939            )
8940
8941            # Update
8942            sql_update = f"""
8943                UPDATE {table_variants}
8944                SET "INFO" = 
8945                    concat(
8946                        CASE
8947                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8948                            THEN ''
8949                            ELSE concat("INFO", ';')
8950                        END,
8951                        CASE
8952                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
8953                             AND dataframe_trio."{trio_infos}" NOT NULL
8954                            THEN concat(
8955                                    '{trio_tag}=',
8956                                    dataframe_trio."{trio_infos}"
8957                                )
8958                            ELSE ''
8959                        END
8960                    )
8961                FROM dataframe_trio
8962                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
8963            """
8964            self.conn.execute(sql_update)
8965
8966            # Remove added columns
8967            for added_column in added_columns:
8968                self.drop_column(column=added_column)
8969
8970            # Delete dataframe
8971            del dataframe_trio
8972            gc.collect()
8973
8974    def calculation_vaf_normalization(self) -> None:
8975        """
8976        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
8977        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
8978        :return: The function does not return anything.
8979        """
8980
8981        # if FORMAT and samples
8982        if (
8983            "FORMAT" in self.get_header_columns_as_list()
8984            and self.get_header_sample_list()
8985        ):
8986
8987            # vaf_normalization annotation field
8988            vaf_normalization_tag = "VAF"
8989
8990            # VCF infos tags
8991            vcf_infos_tags = {
8992                "VAF": "VAF Variant Frequency",
8993            }
8994
8995            # Prefix
8996            prefix = self.get_explode_infos_prefix()
8997
8998            # Variants table
8999            table_variants = self.get_table_variants()
9000
9001            # Header
9002            vcf_reader = self.get_header()
9003
9004            # Do not calculate if VAF already exists
9005            if "VAF" in vcf_reader.formats:
9006                log.debug("VAF already on genotypes")
9007                return
9008
9009            # Create variant id
9010            variant_id_column = self.get_variant_id_column()
9011            added_columns = [variant_id_column]
9012
9013            # variant_id, FORMAT and samples
9014            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9015                self.get_header_sample_list()
9016            )
9017
9018            # Create dataframe
9019            dataframe_vaf_normalization = self.get_query_to_df(
9020                f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
9021            )
9022
9023            vaf_normalization_set = []
9024
9025            # for each sample vaf_normalization
9026            for sample in self.get_header_sample_list():
9027                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
9028                    lambda row: vaf_normalization(row, sample=sample), axis=1
9029                )
9030                vaf_normalization_set.append(
9031                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
9032                )
9033
9034            # Add VAF to FORMAT
9035            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
9036                "FORMAT"
9037            ].apply(lambda x: str(x) + ":VAF")
9038            vaf_normalization_set.append(
9039                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
9040            )
9041
9042            # Add vaf_normalization to header
9043            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
9044                id=vaf_normalization_tag,
9045                num="1",
9046                type="Float",
9047                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
9048                type_code=self.code_type_map.get("Float"),
9049            )
9050
9051            # Create fields to add in INFO
9052            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
9053
9054            # Update
9055            sql_update = f"""
9056                UPDATE {table_variants}
9057                SET {sql_vaf_normalization_set}
9058                FROM dataframe_vaf_normalization
9059                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
9060
9061            """
9062            self.conn.execute(sql_update)
9063
9064            # Remove added columns
9065            for added_column in added_columns:
9066                self.drop_column(column=added_column)
9067
9068            # Delete dataframe
9069            del dataframe_vaf_normalization
9070            gc.collect()
9071
9072    def calculation_genotype_stats(self, info: str = "VAF") -> None:
9073        """
9074        The `calculation_genotype_stats` function calculates genotype statistics for a given information
9075        field in a VCF file and updates the INFO column of the variants table with the calculated
9076        statistics.
9077
9078        :param info: The `info` parameter is a string that represents the type of information for which
9079        genotype statistics are calculated. It is used to generate various VCF info tags for the
9080        statistics, such as the number of occurrences, the list of values, the minimum value, the
9081        maximum value, the mean, the median, defaults to VAF
9082        :type info: str (optional)
9083        """
9084
9085        # if FORMAT and samples
9086        if (
9087            "FORMAT" in self.get_header_columns_as_list()
9088            and self.get_header_sample_list()
9089        ):
9090
9091            # vaf_stats annotation field
9092            vaf_stats_tag = info + "_stats"
9093
9094            # VCF infos tags
9095            vcf_infos_tags = {
9096                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
9097                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
9098                info + "_stats_min": f"genotype {info} Statistics - min {info}",
9099                info + "_stats_max": f"genotype {info} Statistics - max {info}",
9100                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
9101                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
9102                info
9103                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
9104            }
9105
9106            # Prefix
9107            prefix = self.get_explode_infos_prefix()
9108
9109            # Field
9110            vaf_stats_infos = prefix + vaf_stats_tag
9111
9112            # Variants table
9113            table_variants = self.get_table_variants()
9114
9115            # Header
9116            vcf_reader = self.get_header()
9117
9118            # Create variant id
9119            variant_id_column = self.get_variant_id_column()
9120            added_columns = [variant_id_column]
9121
9122            # variant_id, FORMAT and samples
9123            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9124                self.get_header_sample_list()
9125            )
9126
9127            # Create dataframe
9128            dataframe_vaf_stats = self.get_query_to_df(
9129                f""" SELECT {samples_fields} FROM {table_variants} """
9130            )
9131
9132            # Create vaf_stats column
9133            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
9134                lambda row: genotype_stats(
9135                    row, samples=self.get_header_sample_list(), info=info
9136                ),
9137                axis=1,
9138            )
9139
9140            # List of vcf tags
9141            sql_vaf_stats_fields = []
9142
9143            # Check all VAF stats infos
9144            for stat in vcf_infos_tags:
9145
9146                # Extract stats
9147                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
9148                    lambda x: dict(x).get(stat, "")
9149                )
9150
9151                # Add snpeff_hgvs to header
9152                vcf_reader.infos[stat] = vcf.parser._Info(
9153                    stat,
9154                    ".",
9155                    "String",
9156                    vcf_infos_tags.get(stat, "genotype statistics"),
9157                    "howard calculation",
9158                    "0",
9159                    self.code_type_map.get("String"),
9160                )
9161
9162                if len(sql_vaf_stats_fields):
9163                    sep = ";"
9164                else:
9165                    sep = ""
9166
9167                # Create fields to add in INFO
9168                sql_vaf_stats_fields.append(
9169                    f"""
9170                        CASE
9171                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
9172                            THEN concat(
9173                                    '{sep}{stat}=',
9174                                    dataframe_vaf_stats."{stat}"
9175                                )
9176                            ELSE ''
9177                        END
9178                    """
9179                )
9180
9181            # SQL set for update
9182            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)
9183
9184            # Update
9185            sql_update = f"""
9186                UPDATE variants
9187                SET "INFO" = 
9188                    concat(
9189                        CASE
9190                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
9191                            THEN ''
9192                            ELSE concat("INFO", ';')
9193                        END,
9194                        {sql_vaf_stats_fields_set}
9195                    )
9196                FROM dataframe_vaf_stats
9197                WHERE variants."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"
9198
9199            """
9200            self.conn.execute(sql_update)
9201
9202            # Remove added columns
9203            for added_column in added_columns:
9204                self.drop_column(column=added_column)
9205
9206            # Delete dataframe
9207            del dataframe_vaf_stats
9208            gc.collect()
Variants( conn=None, input: str = None, output: str = None, config: dict = {}, param: dict = {}, load: bool = False)
36    def __init__(
37        self,
38        conn=None,
39        input: str = None,
40        output: str = None,
41        config: dict = {},
42        param: dict = {},
43        load: bool = False,
44    ) -> None:
45        """
46        The function `__init__` initializes the variables, sets the input, output, config, param, connexion and
47        header
48
49        :param conn: the connection to the database
50        :param input: the input file
51        :param output: the output file
52        :param config: a dictionary containing the configuration of the model
53        :param param: a dictionary containing the parameters of the model
54        """
55
56        # Init variables
57        self.init_variables()
58
59        # Input
60        self.set_input(input)
61
62        # Config
63        self.set_config(config)
64
65        # Param
66        self.set_param(param)
67
68        # Output
69        self.set_output(output)
70
71        # connexion
72        self.set_connexion(conn)
73
74        # Header
75        self.set_header()
76
77        # Load data
78        if load:
79            self.load_data()

The function __init__ initializes the variables, sets the input, output, config, param, connexion and header

Parameters
  • conn: the connection to the database
  • input: the input file
  • output: the output file
  • config: a dictionary containing the configuration of the model
  • param: a dictionary containing the parameters of the model
def set_input(self, input: str = None) -> None:
 81    def set_input(self, input: str = None) -> None:
 82        """
 83        The function `set_input` takes a file name as input, extracts the name and extension, and sets
 84        attributes in the class accordingly.
 85        
 86        :param input: The `set_input` method in the provided code snippet is used to set attributes
 87        related to the input file. Here's a breakdown of the parameters and their usage in the method:
 88        :type input: str
 89        """
 90
 91        if input and not isinstance(input, str):
 92            try:
 93                self.input = input.name
 94            except:
 95                log.error(f"Input file '{input} in bad format")
 96                raise ValueError(f"Input file '{input} in bad format")
 97        else:
 98            self.input = input
 99
100        # Input format
101        if input:
102            input_name, input_extension = os.path.splitext(self.input)
103            self.input_name = input_name
104            self.input_extension = input_extension
105            self.input_format = self.input_extension.replace(".", "")

The function set_input takes a file name as input, extracts the name and extension, and sets attributes in the class accordingly.

Parameters
  • input: the input file, either a path string or a file-like object whose name attribute gives the path. The file name and extension are extracted and stored as attributes on the instance.
def set_config(self, config: dict) -> None:
107    def set_config(self, config: dict) -> None:
108        """
109        The set_config function takes a config object and assigns it as the configuration object for the
110        class.
111        
112        :param config: The `config` parameter in the `set_config` function is a dictionary object that
113        contains configuration settings for the class. When you call the `set_config` function with a
114        dictionary object as the argument, it will set that dictionary as the configuration object for
115        the class
116        :type config: dict
117        """
118
119        self.config = config

The set_config function takes a config object and assigns it as the configuration object for the class.

Parameters
  • config: a dictionary of configuration settings for the class. The dictionary passed in is stored as the configuration object of the instance.
def set_param(self, param: dict) -> None:
121    def set_param(self, param: dict) -> None:
122        """
123        This function sets a parameter object for the class based on the input dictionary.
124        
125        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
126        as the `param` attribute of the class instance
127        :type param: dict
128        """
129
130        self.param = param

This function sets a parameter object for the class based on the input dictionary.

Parameters
  • param: a dictionary of parameters that is stored as the param attribute of the class instance.
def init_variables(self) -> None:
132    def init_variables(self) -> None:
133        """
134        This function initializes the variables that will be used in the rest of the class
135        """
136
137        self.prefix = "howard"
138        self.table_variants = "variants"
139        self.dataframe = None
140
141        self.comparison_map = {
142            "gt": ">",
143            "gte": ">=",
144            "lt": "<",
145            "lte": "<=",
146            "equals": "=",
147            "contains": "SIMILAR TO",
148        }
149
150        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
151
152        self.code_type_map_to_sql = {
153            "Integer": "INTEGER",
154            "String": "VARCHAR",
155            "Float": "FLOAT",
156            "Flag": "VARCHAR",
157        }
158
159        self.index_additionnal_fields = []

This function initializes the variables that will be used in the rest of the class

def get_indexing(self) -> bool:
161    def get_indexing(self) -> bool:
162        """
163        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
164        returns False.
165        :return: The value of the indexing parameter.
166        """
167
168        return self.get_param().get("indexing", False)

It returns the value of the key "indexing" in the dictionary. If the key is not present, it returns False.

Returns

The value of the indexing parameter.

def get_connexion_config(self) -> dict:
170    def get_connexion_config(self) -> dict:
171        """
172        The function `get_connexion_config` returns a dictionary containing the configuration for a
173        connection, including the number of threads and memory limit.
174        :return: a dictionary containing the configuration for the Connexion library.
175        """
176
177        # config
178        config = self.get_config()
179
180        # Connexion config
181        connexion_config = {}
182        threads = self.get_threads()
183
184        # Threads
185        if threads:
186            connexion_config["threads"] = threads
187
188        # Memory
189        # if config.get("memory", None):
190        #     connexion_config["memory_limit"] = config.get("memory")
191        if self.get_memory():
192            connexion_config["memory_limit"] = self.get_memory()
193
194        # Temporary directory
195        if config.get("tmp", None):
196            connexion_config["temp_directory"] = config.get("tmp")
197
198        # Access
199        if config.get("access", None):
200            access = config.get("access")
201            if access in ["RO"]:
202                access = "READ_ONLY"
203            elif access in ["RW"]:
204                access = "READ_WRITE"
205            connexion_db = self.get_connexion_db()
206            if connexion_db in ":memory:":
207                access = "READ_WRITE"
208            connexion_config["access_mode"] = access
209
210        return connexion_config

The function get_connexion_config returns a dictionary containing the configuration for a connection, including the number of threads and memory limit.

Returns

a dictionary containing the configuration for the database connection (number of threads, memory limit, temporary directory and access mode).

def get_duckdb_settings(self) -> dict:
212    def get_duckdb_settings(self) -> dict:
213        """
214        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
215        string.
216        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
217        """
218
219        # config
220        config = self.get_config()
221
222        # duckdb settings
223        duckdb_settings_dict = {}
224        if config.get("duckdb_settings", None):
225            duckdb_settings = config.get("duckdb_settings")
226            duckdb_settings = full_path(duckdb_settings)
227            # duckdb setting is a file
228            if os.path.exists(duckdb_settings):
229                with open(duckdb_settings) as json_file:
230                    duckdb_settings_dict = yaml.safe_load(json_file)
231            # duckdb settings is a string
232            else:
233                duckdb_settings_dict = json.loads(duckdb_settings)
234
235        return duckdb_settings_dict

The function get_duckdb_settings retrieves DuckDB settings from a configuration file or a string.

Returns

The function get_duckdb_settings returns a dictionary object duckdb_settings_dict.

def set_connexion_db(self) -> str:
237    def set_connexion_db(self) -> str:
238        """
239        The function `set_connexion_db` returns the appropriate database connection string based on the
240        input format and connection type.
241        :return: the value of the variable `connexion_db`.
242        """
243
244        # Default connexion db
245        default_connexion_db = ":memory:"
246
247        # Find connexion db
248        if self.get_input_format() in ["db", "duckdb"]:
249            connexion_db = self.get_input()
250        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
251            connexion_db = default_connexion_db
252        elif self.get_connexion_type() in ["tmpfile"]:
253            tmp_name = tempfile.mkdtemp(
254                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
255            )
256            connexion_db = f"{tmp_name}/tmp.db"
257        elif self.get_connexion_type() != "":
258            connexion_db = self.get_connexion_type()
259        else:
260            connexion_db = default_connexion_db
261
262        # Set connexion db
263        self.connexion_db = connexion_db
264
265        return connexion_db

The function set_connexion_db returns the appropriate database connection string based on the input format and connection type.

Returns

the value of the variable connexion_db.

def set_connexion(self, conn) -> None:
267    def set_connexion(self, conn) -> None:
268        """
269        The function `set_connexion` creates a connection to a database, with options for different
270        database formats and settings.
271        
272        :param conn: The `conn` parameter in the `set_connexion` method is the connection to the
273        database. If a connection is not provided, a new connection to an in-memory database is created.
274        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
275        sqlite
276        """
277
278        # Connexion db
279        connexion_db = self.set_connexion_db()
280
281        # Connexion config
282        connexion_config = self.get_connexion_config()
283
284        # Connexion format
285        connexion_format = self.get_config().get("connexion_format", "duckdb")
286        # Set connexion format
287        self.connexion_format = connexion_format
288
289        # Connexion
290        if not conn:
291            if connexion_format in ["duckdb"]:
292                conn = duckdb.connect(connexion_db, config=connexion_config)
293                # duckDB settings
294                duckdb_settings = self.get_duckdb_settings()
295                if duckdb_settings:
296                    for setting in duckdb_settings:
297                        setting_value = duckdb_settings.get(setting)
298                        if isinstance(setting_value, str):
299                            setting_value = f"'{setting_value}'"
300                        conn.execute(f"PRAGMA {setting}={setting_value};")
301            elif connexion_format in ["sqlite"]:
302                conn = sqlite3.connect(connexion_db)
303
304        # Set connexion
305        self.conn = conn
306
307        # Log
308        log.debug(f"connexion_format: {connexion_format}")
309        log.debug(f"connexion_db: {connexion_db}")
310        log.debug(f"connexion config: {connexion_config}")
311        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")

The function set_connexion creates a connection to a database, with options for different database formats and settings.

Parameters
  • conn: the connection to the database. If a connection is not provided, a new connection is created based on the configured connection format (e.g., duckdb or sqlite) and connection settings.
def set_output(self, output: str = None) -> None:
313    def set_output(self, output: str = None) -> None:
314        """
315        The `set_output` function in Python sets the output file based on the input or a specified key
316        in the config file, extracting the output name, extension, and format.
317        
318        :param output: The `output` parameter in the `set_output` method is used to specify the name of
319        the output file. If the config file has an 'output' key, the method sets the output to the value
320        of that key. If no output is provided, it sets the output to `None`
321        :type output: str
322        """
323
324        if output and not isinstance(output, str):
325            self.output = output.name
326        else:
327            self.output = output
328
329        # Output format
330        if self.output:
331            output_name, output_extension = os.path.splitext(self.output)
332            self.output_name = output_name
333            self.output_extension = output_extension
334            self.output_format = self.output_extension.replace(".", "")
335        else:
336            self.output_name = None
337            self.output_extension = None
338            self.output_format = None

The set_output function in Python sets the output file based on the input or a specified key in the config file, extracting the output name, extension, and format.

Parameters
  • output: The output parameter in the set_output method is used to specify the name of the output file. If the config file has an 'output' key, the method sets the output to the value of that key. If no output is provided, it sets the output to None
def set_header(self) -> None:
340    def set_header(self) -> None:
341        """
342        It reads the header of a VCF file and stores it as a list of strings and as a VCF object
343        """
344
345        input_file = self.get_input()
346        default_header_list = [
347            "##fileformat=VCFv4.2",
348            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
349        ]
350
351        # Full path
352        input_file = full_path(input_file)
353
354        if input_file:
355
356            input_format = self.get_input_format()
357            input_compressed = self.get_input_compressed()
358            config = self.get_config()
359            header_list = default_header_list
360            if input_format in [
361                "vcf",
362                "hdr",
363                "tsv",
364                "csv",
365                "psv",
366                "parquet",
367                "db",
368                "duckdb",
369            ]:
370                # header provided in param
371                if config.get("header_file", None):
372                    with open(config.get("header_file"), "rt") as f:
373                        header_list = self.read_vcf_header(f)
374                # within a vcf file format (header within input file itsself)
375                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
376                    # within a compressed vcf file format (.vcf.gz)
377                    if input_compressed:
378                        with bgzf.open(input_file, "rt") as f:
379                            header_list = self.read_vcf_header(f)
380                    # within an uncompressed vcf file format (.vcf)
381                    else:
382                        with open(input_file, "rt") as f:
383                            header_list = self.read_vcf_header(f)
384                # header provided in default external file .hdr
385                elif os.path.exists((input_file + ".hdr")):
386                    with open(input_file + ".hdr", "rt") as f:
387                        header_list = self.read_vcf_header(f)
388                else:
389                    try:  # Try to get header info fields and file columns
390
391                        with tempfile.TemporaryDirectory() as tmpdir:
392
393                            # Create database
394                            db_for_header = Database(database=input_file)
395
396                            # Get header columns for infos fields
397                            db_header_from_columns = (
398                                db_for_header.get_header_from_columns()
399                            )
400
401                            # Get real columns in the file
402                            db_header_columns = db_for_header.get_columns()
403
404                            # Write header file
405                            header_file_tmp = os.path.join(tmpdir, "header")
406                            f = open(header_file_tmp, "w")
407                            vcf.Writer(f, db_header_from_columns)
408                            f.close()
409
410                            # Replace #CHROM line with rel columns
411                            header_list = db_for_header.read_header_file(
412                                header_file=header_file_tmp
413                            )
414                            header_list[-1] = "\t".join(db_header_columns)
415
416                    except:
417
418                        log.warning(
419                            f"No header for file {input_file}. Set as default VCF header"
420                        )
421                        header_list = default_header_list
422
423            else:  # try for unknown format ?
424
425                log.error(f"Input file format '{input_format}' not available")
426                raise ValueError(f"Input file format '{input_format}' not available")
427
428            if not header_list:
429                header_list = default_header_list
430
431            # header as list
432            self.header_list = header_list
433
434            # header as VCF object
435            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))
436
437        else:
438
439            self.header_list = None
440            self.header_vcf = None

It reads the header of a VCF file and stores it as a list of strings and as a VCF object

def get_query_to_df(self, query: str = '', limit: int = None) -> pandas.core.frame.DataFrame:
442    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
443        """
444        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
445        DataFrame based on the connection format.
446
447        :param query: The `query` parameter in the `get_query_to_df` function is a string that
448        represents the SQL query you want to execute. This query will be used to fetch data from a
449        database and convert it into a pandas DataFrame
450        :type query: str
451        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
452        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
453        function will only fetch up to that number of rows from the database query result. If no limit
454        is specified,
455        :type limit: int
456        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
457        """
458
459        # Connexion format
460        connexion_format = self.get_connexion_format()
461
462        # Limit in query
463        if limit:
464            pd.set_option("display.max_rows", limit)
465            if connexion_format in ["duckdb"]:
466                df = (
467                    self.conn.execute(query)
468                    .fetch_record_batch(limit)
469                    .read_next_batch()
470                    .to_pandas()
471                )
472            elif connexion_format in ["sqlite"]:
473                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
474
475        # Full query
476        else:
477            if connexion_format in ["duckdb"]:
478                df = self.conn.execute(query).df()
479            elif connexion_format in ["sqlite"]:
480                df = pd.read_sql_query(query, self.conn)
481
482        return df

The get_query_to_df function takes a query as a string and returns the result as a pandas DataFrame based on the connection format.

Parameters
  • query: The query parameter in the get_query_to_df function is a string that represents the SQL query you want to execute. This query will be used to fetch data from a database and convert it into a pandas DataFrame
  • limit: the maximum number of rows to return in the resulting DataFrame. If a limit is provided, the function fetches at most that many rows from the query result; if no limit is specified, the full query result is returned.
Returns

A pandas DataFrame is being returned by the get_query_to_df function.

def get_overview(self) -> None:
484    def get_overview(self) -> None:
485        """
486        The function prints the input, output, config, and dataframe of the current object
487        """
488        table_variants_from = self.get_table_variants(clause="from")
489        sql_columns = self.get_header_columns_as_sql()
490        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
491        df = self.get_query_to_df(sql_query_export)
492        log.info(
493            "Input:  "
494            + str(self.get_input())
495            + " ["
496            + str(str(self.get_input_format()))
497            + "]"
498        )
499        log.info(
500            "Output: "
501            + str(self.get_output())
502            + " ["
503            + str(str(self.get_output_format()))
504            + "]"
505        )
506        log.info("Config: ")
507        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
508            "\n"
509        ):
510            log.info("\t" + str(d))
511        log.info("Param: ")
512        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
513            "\n"
514        ):
515            log.info("\t" + str(d))
516        log.info("Sample list: " + str(self.get_header_sample_list()))
517        log.info("Dataframe: ")
518        for d in str(df).split("\n"):
519            log.info("\t" + str(d))
520
521        # garbage collector
522        del df
523        gc.collect()
524
525        return None

The function prints the input, output, config, and dataframe of the current object

def get_stats(self) -> dict:
527    def get_stats(self) -> dict:
528        """
529        The `get_stats` function calculates and returns various statistics of the current object,
530        including information about the input file, variants, samples, header fields, quality, and
531        SNVs/InDels.
532        :return: a dictionary containing various statistics of the current object. The dictionary has
533        the following structure:
534        """
535
536        # Log
537        log.info(f"Stats Calculation...")
538
539        # table varaints
540        table_variants_from = self.get_table_variants()
541
542        # stats dict
543        stats = {"Infos": {}}
544
545        ### File
546        input_file = self.get_input()
547        stats["Infos"]["Input file"] = input_file
548
549        # Header
550        header_infos = self.get_header().infos
551        header_formats = self.get_header().formats
552        header_infos_list = list(header_infos)
553        header_formats_list = list(header_formats)
554
555        ### Variants
556
557        stats["Variants"] = {}
558
559        # Variants by chr
560        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
561        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
562        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
563            by=["CHROM"], kind="quicksort"
564        )
565
566        # Total number of variants
567        nb_of_variants = nb_of_variants_by_chrom["count"].sum()
568
569        # Calculate percentage
570        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
571            lambda x: (x / nb_of_variants)
572        )
573
574        stats["Variants"]["Number of variants by chromosome"] = (
575            nb_of_variants_by_chrom.to_dict(orient="index")
576        )
577
578        stats["Infos"]["Number of variants"] = int(nb_of_variants)
579
580        ### Samples
581
582        # Init
583        samples = {}
584        nb_of_samples = 0
585
586        # Check Samples
587        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
588            log.debug(f"Check samples...")
589            for sample in self.get_header_sample_list():
590                sql_query_samples = f"""
591                    SELECT  '{sample}' as sample,
592                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
593                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
594                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
595                    FROM {table_variants_from}
596                    WHERE (
597                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
598                        AND
599                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
600                      )
601                    GROUP BY genotype
602                    """
603                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
604                sample_genotype_count = sql_query_genotype_df["count"].sum()
605                if len(sql_query_genotype_df):
606                    nb_of_samples += 1
607                    samples[f"{sample} - {sample_genotype_count} variants"] = (
608                        sql_query_genotype_df.to_dict(orient="index")
609                    )
610
611            stats["Samples"] = samples
612            stats["Infos"]["Number of samples"] = nb_of_samples
613
614        # #
615        # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list:
616        #     stats["Infos"]["Number of samples"] = nb_of_samples
617        # elif nb_of_samples:
618        #     stats["Infos"]["Number of samples"] = "not a VCF format"
619
620        ### INFO and FORMAT fields
621        header_types_df = {}
622        header_types_list = {
623            "List of INFO fields": header_infos,
624            "List of FORMAT fields": header_formats,
625        }
626        i = 0
627        for header_type in header_types_list:
628
629            header_type_infos = header_types_list.get(header_type)
630            header_infos_dict = {}
631
632            for info in header_type_infos:
633
634                i += 1
635                header_infos_dict[i] = {}
636
637                # ID
638                header_infos_dict[i]["id"] = info
639
640                # num
641                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
642                if header_type_infos[info].num in genotype_map.keys():
643                    header_infos_dict[i]["Number"] = genotype_map.get(
644                        header_type_infos[info].num
645                    )
646                else:
647                    header_infos_dict[i]["Number"] = header_type_infos[info].num
648
649                # type
650                if header_type_infos[info].type:
651                    header_infos_dict[i]["Type"] = header_type_infos[info].type
652                else:
653                    header_infos_dict[i]["Type"] = "."
654
655                # desc
656                if header_type_infos[info].desc != None:
657                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
658                else:
659                    header_infos_dict[i]["Description"] = ""
660
661            if len(header_infos_dict):
662                header_types_df[header_type] = pd.DataFrame.from_dict(
663                    header_infos_dict, orient="index"
664                ).to_dict(orient="index")
665
666        # Stats
667        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
668        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
669        stats["Header"] = header_types_df
670
671        ### QUAL
672        if "QUAL" in self.get_header_columns():
673            sql_query_qual = f"""
674                    SELECT
675                        avg(CAST(QUAL AS INTEGER)) AS Average,
676                        min(CAST(QUAL AS INTEGER)) AS Minimum,
677                        max(CAST(QUAL AS INTEGER)) AS Maximum,
678                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
679                        median(CAST(QUAL AS INTEGER)) AS Median,
680                        variance(CAST(QUAL AS INTEGER)) AS Variance
681                    FROM {table_variants_from}
682                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
683                    """
684
685            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
686            stats["Quality"] = {"Stats": qual}
687
688        ### SNV and InDel
689
690        sql_query_snv = f"""
691            
692            SELECT Type, count FROM (
693
694                    SELECT
695                        'Total' AS Type,
696                        count(*) AS count
697                    FROM {table_variants_from}
698
699                    UNION
700
701                    SELECT
702                        'MNV' AS Type,
703                        count(*) AS count
704                    FROM {table_variants_from}
705                    WHERE len(REF) > 1 AND len(ALT) > 1
706                    AND len(REF) = len(ALT)
707
708                    UNION
709
710                    SELECT
711                        'InDel' AS Type,
712                        count(*) AS count
713                    FROM {table_variants_from}
714                    WHERE len(REF) > 1 OR len(ALT) > 1
715                    AND len(REF) != len(ALT)
716                    
717                    UNION
718
719                    SELECT
720                        'SNV' AS Type,
721                        count(*) AS count
722                    FROM {table_variants_from}
723                    WHERE len(REF) = 1 AND len(ALT) = 1
724
725                )
726
727            ORDER BY count DESC
728
729                """
730        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")
731
732        sql_query_snv_substitution = f"""
733                SELECT
734                    concat(REF, '>', ALT) AS 'Substitution',
735                    count(*) AS count
736                FROM {table_variants_from}
737                WHERE len(REF) = 1 AND len(ALT) = 1
738                GROUP BY REF, ALT
739                ORDER BY count(*) DESC
740                """
741        snv_substitution = (
742            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
743        )
744        stats["Variants"]["Counts"] = snv_indel
745        stats["Variants"]["Substitutions"] = snv_substitution
746
747        return stats

The get_stats function calculates and returns various statistics of the current object, including information about the input file, variants, samples, header fields, quality, and SNVs/InDels.

Returns

a dictionary containing various statistics of the current object, organized into sections such as "Infos", "Variants", "Header" and, when applicable, "Samples" and "Quality".

def stats_to_file(self, file: str = None) -> str:
749    def stats_to_file(self, file: str = None) -> str:
750        """
751        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
752        into a JSON object, and writes the JSON object to the specified file.
753
754        :param file: The `file` parameter is a string that represents the file path where the JSON data
755        will be written
756        :type file: str
757        :return: the name of the file that was written to.
758        """
759
760        # Get stats
761        stats = self.get_stats()
762
763        # Serializing json
764        json_object = json.dumps(stats, indent=4)
765
766        # Writing to sample.json
767        with open(file, "w") as outfile:
768            outfile.write(json_object)
769
770        return file

The function stats_to_file takes a file name as input, retrieves statistics, serializes them into a JSON object, and writes the JSON object to the specified file.

Parameters
  • file: The file parameter is a string that represents the file path where the JSON data will be written
Returns

the name of the file that was written to.

def print_stats(self, output_file: str = None, json_file: str = None) -> None:
    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
        """
        Generate a Markdown report of the object's statistics and print it.

        The statistics are first serialized to a JSON file (via
        `stats_to_file`), re-loaded, rendered as Markdown (one section per
        top-level key, with tables for dict-like entries), written to
        `output_file` and finally printed to stdout.

        :param output_file: path of the Markdown output file; when not
            provided, "stats.md" inside a temporary directory is used
        :type output_file: str
        :param json_file: path of the JSON file the statistics are saved to;
            when not provided, "stats.json" inside a temporary directory is used
        :type json_file: str
        :return: None
        """

        # Resolve user-supplied paths to absolute paths (None stays falsy)
        output_file = full_path(output_file)
        json_file = full_path(json_file)

        with tempfile.TemporaryDirectory() as tmpdir:

            # Default both files into the temporary directory when not provided
            if not output_file:
                output_file = os.path.join(tmpdir, "stats.md")
            if not json_file:
                json_file = os.path.join(tmpdir, "stats.json")

            # Ensure parent folders exist for both output files
            if not os.path.exists(os.path.dirname(output_file)):
                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
            if not os.path.exists(os.path.dirname(json_file)):
                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)

            # Serialize the statistics to JSON
            stats_file = self.stats_to_file(file=json_file)

            # Re-load the JSON file (yaml.safe_load parses JSON as a subset of YAML)
            with open(stats_file) as f:
                stats = yaml.safe_load(f)

            # Markdown fragments: title, index (links), and body sections
            output_title = []
            output_index = []
            output = []

            # Title
            output_title.append("# HOWARD Stats")

            # Index
            output_index.append("## Index")

            # One "## section" per top-level stats key, with an index link
            for section in stats:
                infos = stats.get(section)
                section_link = "#" + section.lower().replace(" ", "-")
                output.append(f"## {section}")
                output_index.append(f"- [{section}]({section_link})")

                if len(infos):
                    for info in infos:
                        # Try to render the entry as a table: first as a dict,
                        # then as a JSON-encoded string; otherwise fall back
                        # to a plain "key: value" bullet.
                        # NOTE(review): bare except swallows all errors here,
                        # including KeyboardInterrupt — consider narrowing.
                        try:
                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
                            is_df = True
                        except:
                            try:
                                df = pd.DataFrame.from_dict(
                                    json.loads((infos.get(info))), orient="index"
                                )
                                is_df = True
                            except:
                                is_df = False
                        if is_df:
                            output.append(f"### {info}")
                            info_link = "#" + info.lower().replace(" ", "-")
                            output_index.append(f"   - [{info}]({info_link})")
                            output.append(f"{df.to_markdown(index=False)}")
                        else:
                            output.append(f"- {info}: {infos.get(info)}")
                else:
                    # Empty section
                    output.append(f"NA")

            # Write title, index and body to the markdown file
            with open(output_file, "w") as fp:
                for item in output_title:
                    fp.write("%s\n" % item)
                for item in output_index:
                    fp.write("%s\n" % item)
                for item in output:
                    fp.write("%s\n" % item)

            # Print title and body to stdout
            # NOTE(review): the index is written to the file but intentionally
            # (?) not printed — confirm this asymmetry is desired.
            print("")
            print("\n\n".join(output_title))
            print("")
            print("\n\n".join(output))
            print("")

        return None

The print_stats function generates a markdown file and prints the statistics contained in a JSON file in a formatted manner.

Parameters
  • output_file: The output_file parameter is a string that specifies the path and filename of the output file where the stats will be printed in Markdown format. If no output_file is provided, a temporary directory will be created and the stats will be saved in a file named "stats.md" within that
  • json_file: The json_file parameter is a string that represents the path to the JSON file where the statistics will be saved. If no value is provided, a temporary directory will be created and a default file name "stats.json" will be used
Returns

The function print_stats does not return any value. It has a return type annotation of None.

def get_input(self) -> str:
874    def get_input(self) -> str:
875        """
876        It returns the value of the input variable.
877        :return: The input is being returned.
878        """
879        return self.input

It returns the value of the input variable.

Returns

The input is being returned.

def get_input_format(self, input_file: str = None) -> str:
881    def get_input_format(self, input_file: str = None) -> str:
882        """
883        This function returns the format of the input variable, either from the provided input file or
884        by prompting for input.
885
886        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
887        represents the file path of the input file. If no `input_file` is provided when calling the
888        method, it will default to `None`
889        :type input_file: str
890        :return: The format of the input variable is being returned.
891        """
892
893        if not input_file:
894            input_file = self.get_input()
895        input_format = get_file_format(input_file)
896        return input_format

This function returns the format of the input variable, either from the provided input file or by prompting for input.

Parameters
  • input_file: The input_file parameter in the get_input_format method is a string that represents the file path of the input file. If no input_file is provided when calling the method, it will default to None
Returns

The format of the input variable is being returned.

def get_input_compressed(self, input_file: str = None) -> str:
898    def get_input_compressed(self, input_file: str = None) -> str:
899        """
900        The function `get_input_compressed` returns the format of the input variable after compressing
901        it.
902
903        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
904        that represents the file path of the input file. If no `input_file` is provided when calling the
905        method, it will default to `None` and the method will then call `self.get_input()` to
906        :type input_file: str
907        :return: The function `get_input_compressed` returns the compressed format of the input
908        variable.
909        """
910
911        if not input_file:
912            input_file = self.get_input()
913        input_compressed = get_file_compressed(input_file)
914        return input_compressed

The function get_input_compressed returns the format of the input variable after compressing it.

Parameters
  • input_file: The input_file parameter in the get_input_compressed method is a string that represents the file path of the input file. If no input_file is provided when calling the method, it will default to None and the method will then call self.get_input() to
Returns

The function get_input_compressed returns the compressed format of the input variable.

def get_output(self) -> str:
916    def get_output(self) -> str:
917        """
918        It returns the output of the neuron.
919        :return: The output of the neural network.
920        """
921
922        return self.output

It returns the value of the output variable.

Returns

The output file path is being returned.

def get_output_format(self, output_file: str = None) -> str:
924    def get_output_format(self, output_file: str = None) -> str:
925        """
926        The function `get_output_format` returns the format of the input variable or the output file if
927        provided.
928
929        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
930        that represents the file path of the output file. If no `output_file` is provided when calling
931        the method, it will default to the output obtained from the `get_output` method of the class
932        instance. The
933        :type output_file: str
934        :return: The format of the input variable is being returned.
935        """
936
937        if not output_file:
938            output_file = self.get_output()
939        output_format = get_file_format(output_file)
940
941        return output_format

The function get_output_format returns the format of the input variable or the output file if provided.

Parameters
  • output_file: The output_file parameter in the get_output_format method is a string that represents the file path of the output file. If no output_file is provided when calling the method, it will default to the output obtained from the get_output method of the class instance. The
Returns

The format of the input variable is being returned.

def get_config(self) -> dict:
943    def get_config(self) -> dict:
944        """
945        It returns the config
946        :return: The config variable is being returned.
947        """
948        return self.config

It returns the config

Returns

The config variable is being returned.

def get_param(self) -> dict:
950    def get_param(self) -> dict:
951        """
952        It returns the param
953        :return: The param variable is being returned.
954        """
955        return self.param

It returns the param

Returns

The param variable is being returned.

def get_connexion_db(self) -> str:
957    def get_connexion_db(self) -> str:
958        """
959        It returns the connexion_db attribute of the object
960        :return: The connexion_db is being returned.
961        """
962        return self.connexion_db

It returns the connexion_db attribute of the object

Returns

The connexion_db is being returned.

def get_prefix(self) -> str:
964    def get_prefix(self) -> str:
965        """
966        It returns the prefix of the object.
967        :return: The prefix is being returned.
968        """
969        return self.prefix

It returns the prefix of the object.

Returns

The prefix is being returned.

def get_table_variants(self, clause: str = 'select') -> str:
971    def get_table_variants(self, clause: str = "select") -> str:
972        """
973        This function returns the table_variants attribute of the object
974
975        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
976        defaults to select (optional)
977        :return: The table_variants attribute of the object.
978        """
979
980        # Access
981        access = self.get_config().get("access", None)
982
983        # Clauses "select", "where", "update"
984        if clause in ["select", "where", "update"]:
985            table_variants = self.table_variants
986        # Clause "from"
987        elif clause in ["from"]:
988            # For Read Only
989            if self.get_input_format() in ["parquet"] and access in ["RO"]:
990                input_file = self.get_input()
991                table_variants = f"'{input_file}' as variants"
992            # For Read Write
993            else:
994                table_variants = f"{self.table_variants} as variants"
995        else:
996            table_variants = self.table_variants
997        return table_variants

This function returns the table_variants attribute of the object

Parameters
  • clause: the type of clause the table will be used. Either "select" or "from" (optional), defaults to select (optional)
Returns

The table_variants attribute of the object.

def get_tmp_dir(self) -> str:
 999    def get_tmp_dir(self) -> str:
1000        """
1001        The function `get_tmp_dir` returns the temporary directory path based on configuration
1002        parameters or a default path.
1003        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
1004        configuration, parameters, and a default value of "/tmp".
1005        """
1006
1007        return get_tmp(
1008            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
1009        )

The function get_tmp_dir returns the temporary directory path based on configuration parameters or a default path.

Returns

The get_tmp_dir method is returning the temporary directory path based on the configuration, parameters, and a default value of "/tmp".

def get_connexion_type(self) -> str:
1011    def get_connexion_type(self) -> str:
1012        """
1013        If the connexion type is not in the list of allowed connexion types, raise a ValueError
1014
1015        :return: The connexion type is being returned.
1016        """
1017        return self.get_config().get("connexion_type", "memory")

It returns the connexion type from the configuration, defaulting to "memory" when the key is not set.

Returns

The connexion type is being returned.

def get_connexion(self):
1019    def get_connexion(self):
1020        """
1021        It returns the connection object
1022
1023        :return: The connection object.
1024        """
1025        return self.conn

It returns the connection object

Returns

The connection object.

def close_connexion(self) -> None:
1027    def close_connexion(self) -> None:
1028        """
1029        This function closes the connection to the database.
1030        :return: The connection is being closed.
1031        """
1032        return self.conn.close()

This function closes the connection to the database.

Returns

The connection is being closed.

def get_header(self, type: str = 'vcf'):
1034    def get_header(self, type: str = "vcf"):
1035        """
1036        This function returns the header of the VCF file as a list of strings
1037
1038        :param type: the type of header you want to get, defaults to vcf (optional)
1039        :return: The header of the vcf file.
1040        """
1041
1042        if self.header_vcf:
1043            if type == "vcf":
1044                return self.header_vcf
1045            elif type == "list":
1046                return self.header_list
1047        else:
1048            if type == "vcf":
1049                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
1050                return header
1051            elif type == "list":
1052                return vcf_required

This function returns the header of the VCF file as a list of strings

Parameters
  • type: the type of header you want to get, defaults to vcf (optional)
Returns

The header of the vcf file.

def get_header_length(self, file: str = None) -> int:
1054    def get_header_length(self, file: str = None) -> int:
1055        """
1056        The function `get_header_length` returns the length of the header list, excluding the #CHROM
1057        line.
1058
1059        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
1060        header file. If this argument is provided, the function will read the header from the specified
1061        file and return the length of the header list minus 1 (to exclude the #CHROM line)
1062        :type file: str
1063        :return: the length of the header list, excluding the #CHROM line.
1064        """
1065
1066        if file:
1067            return len(self.read_vcf_header_file(file=file)) - 1
1068        elif self.get_header(type="list"):
1069            return len(self.get_header(type="list")) - 1
1070        else:
1071            return 0

The function get_header_length returns the length of the header list, excluding the #CHROM line.

Parameters
  • file: The file parameter is an optional argument that specifies the path to a VCF header file. If this argument is provided, the function will read the header from the specified file and return the length of the header list minus 1 (to exclude the #CHROM line)
Returns

the length of the header list, excluding the #CHROM line.

def get_header_columns(self) -> str:
1073    def get_header_columns(self) -> str:
1074        """
1075        This function returns the header list of a VCF
1076
1077        :return: The length of the header list.
1078        """
1079        if self.get_header():
1080            return self.get_header(type="list")[-1]
1081        else:
1082            return ""

This function returns the last header line of a VCF (the #CHROM columns line).

Returns

The columns header line, or an empty string when there is no header.

def get_header_columns_as_list(self) -> list:
1084    def get_header_columns_as_list(self) -> list:
1085        """
1086        This function returns the header list of a VCF
1087
1088        :return: The length of the header list.
1089        """
1090        if self.get_header():
1091            return self.get_header_columns().strip().split("\t")
1092        else:
1093            return []

This function returns the column names of the VCF header as a list.

Returns

The list of column names, or an empty list when there is no header.

def get_header_columns_as_sql(self) -> str:
1095    def get_header_columns_as_sql(self) -> str:
1096        """
1097        This function retruns header length (without #CHROM line)
1098
1099        :return: The length of the header list.
1100        """
1101        sql_column_list = []
1102        for col in self.get_header_columns_as_list():
1103            sql_column_list.append(f'"{col}"')
1104        return ",".join(sql_column_list)

This function returns the header column names as a comma-separated list of double-quoted SQL identifiers.

Returns

The SQL-ready column list string.

def get_header_sample_list(self) -> list:
1106    def get_header_sample_list(self) -> list:
1107        """
1108        This function retruns header length (without #CHROM line)
1109
1110        :return: The length of the header list.
1111        """
1112        return self.header_vcf.samples

This function returns the list of sample names from the VCF header.

Returns

The list of sample names.

def get_verbose(self) -> bool:
1114    def get_verbose(self) -> bool:
1115        """
1116        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
1117        exist
1118
1119        :return: The value of the key "verbose" in the config dictionary.
1120        """
1121        return self.get_config().get("verbose", False)

It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't exist

Returns

The value of the key "verbose" in the config dictionary.

def get_connexion_format(self) -> str:
1123    def get_connexion_format(self) -> str:
1124        """
1125        It returns the connexion format of the object.
1126        :return: The connexion_format is being returned.
1127        """
1128        connexion_format = self.connexion_format
1129        if connexion_format not in ["duckdb", "sqlite"]:
1130            log.error(f"Unknown connexion format {connexion_format}")
1131            raise ValueError(f"Unknown connexion format {connexion_format}")
1132        else:
1133            return connexion_format

It returns the connexion format of the object.

Returns

The connexion_format is being returned.

def insert_file_to_table( self, file, columns: str, header_len: int = 0, sep: str = '\t', chunksize: int = 1000000) -> None:
    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        Read a delimited file in chunks and insert each chunk into the
        "variants" table of the current database connection.

        :param file: the file (path or file-like object) to load into the table
        :param columns: comma-separated, SQL-quoted column names used for the
            DuckDB INSERT statement
        :type columns: str
        :param header_len: number of leading lines to skip before the data
            (e.g. a VCF header), defaults to 0
        :type header_len: int (optional)
        :param sep: field delimiter of the input file, defaults to "\t"
        :type sep: str (optional)
        :param chunksize: number of rows read per chunk; may be overridden by
            the "load.chunk" config entry, defaults to 1000000
        :type chunksize: int (optional)
        """

        # The "load.chunk" config entry takes precedence over the argument
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        # NOTE(review): if chunksize resolves to a falsy value (0/None),
        # nothing is loaded at all — confirm this is intended.
        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # DuckDB resolves the pandas DataFrame bound to the local
                    # name "chunk" by replacement scan — do NOT rename this
                    # loop variable, the SQL below references it by name.
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    # SQLite path inserts the whole DataFrame; the `columns`
                    # argument is not applied here (pandas maps columns by name)
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)

The function reads a file in chunks and inserts each chunk into a table based on the specified database format.

Parameters
  • file: The file parameter is the file that you want to load into a table. It should be the path to the file on your system
  • columns: The columns parameter in the insert_file_to_table function is a string that should contain the names of the columns in the table where the data will be inserted. The column names should be separated by commas within the string. For example, if you have columns named "id", "name
  • header_len: The header_len parameter in the insert_file_to_table function specifies the number of lines to skip at the beginning of the file before reading the actual data. This parameter allows you to skip any header information present in the file before processing the data, defaults to 0
  • sep: The sep parameter in the insert_file_to_table function is used to specify the separator character that is used in the file being read. In this case, the default separator is set to , which represents a tab character. You can change this parameter to a different separator character if, defaults to
  • chunksize: The chunksize parameter specifies the number of rows to read in at a time when processing the file in chunks. In the provided code snippet, the default value for chunksize is set to 1000000. This means that the file will be read in chunks of 1,, defaults to 1000000
def load_data( self, input_file: str = None, drop_variants_table: bool = False, sample_size: int = 20480) -> None:
    def load_data(
        self,
        input_file: str = None,
        drop_variants_table: bool = False,
        sample_size: int = 20480,
    ) -> None:
        """
        Read the input file and load it into the variants table, with options
        to drop the table beforehand and to control the sample size used for
        schema detection.

        :param input_file: path to the input file to load; when given, it
            replaces the object's current input and the header is re-read
        :type input_file: str
        :param drop_variants_table: whether to drop the variants table before
            loading, defaults to False
        :type drop_variants_table: bool (optional)
        :param sample_size: number of rows sampled from the input file for
            schema detection; a falsy value is converted to -1, defaults to
            20480
        :type sample_size: int (optional)
        """

        log.info("Loading...")

        # Switch to a new input file and re-read its header
        if input_file:
            self.set_input(input_file)
            self.set_header()

        # Optionally drop the existing variants table
        if drop_variants_table:
            self.drop_variants_table()

        # Target table name
        table_variants = self.get_table_variants()

        # Access mode ("RO" means read-only: load as a VIEW, not a TABLE)
        access = self.get_config().get("access", None)
        log.debug(f"access: {access}")

        # Input format and compression status
        input_format = self.get_input_format()
        input_compressed = self.get_input_compressed()
        log.debug(f"input_format: {input_format}")
        log.debug(f"input_compressed: {input_compressed}")

        # Compression label for logging purposes
        if input_compressed:
            input_compressed_format = "gzip"
        else:
            input_compressed_format = "none"
        log.debug(f"input_compressed_format: {input_compressed_format}")

        # Database backend ("duckdb" or "sqlite")
        connexion_format = self.get_connexion_format()

        # Falsy sample size means "no sampling limit" (-1)
        if not sample_size:
            sample_size = -1
        log.debug(f"sample_size: {sample_size}")

        # Load data
        log.debug(f"Load Data from {input_format}")

        # DuckDB backend
        if connexion_format in ["duckdb"]:

            # Input is already a DuckDB database file: nothing to load
            if self.input_format in ["db", "duckdb"]:

                if connexion_format in ["duckdb"]:
                    log.debug(f"Input file format '{self.input_format}' duckDB")
                else:
                    # Unreachable in this branch (connexion_format is duckdb);
                    # kept as a defensive guard
                    log.error(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )
                    raise ValueError(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )

            # Load from a supported file format through the Database helper
            else:

                try:
                    # Build the source SQL expression, then materialize it as
                    # a VIEW (read-only access) or a TABLE (read-write)
                    database = Database(database=self.input)
                    sql_from = database.get_sql_from(sample_size=sample_size)

                    if access in ["RO"]:
                        sql_load = (
                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    else:
                        sql_load = (
                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    self.conn.execute(sql_load)

                except:
                    # NOTE(review): bare except masks the real failure (e.g. a
                    # SQL error or an existing table) behind a generic
                    # "format not available" error — consider narrowing.
                    log.error(f"Input file format '{self.input_format}' not available")
                    raise ValueError(
                        f"Input file format '{self.input_format}' not available"
                    )

        # SQLite backend: only delimited text formats are supported
        elif connexion_format in ["sqlite"] and input_format in [
            "vcf",
            "tsv",
            "csv",
            "psv",
        ]:

            # Fixed VCF column structure
            structure = {
                "#CHROM": "VARCHAR",
                "POS": "INTEGER",
                "ID": "VARCHAR",
                "REF": "VARCHAR",
                "ALT": "VARCHAR",
                "QUAL": "VARCHAR",
                "FILTER": "VARCHAR",
                "INFO": "VARCHAR",
            }

            # Extend the structure with FORMAT and one column per sample
            # (structure_complete aliases structure — both names refer to the
            # same dict)
            structure_complete = structure
            if self.get_header_sample_list():
                structure["FORMAT"] = "VARCHAR"
                for sample in self.get_header_sample_list():
                    structure_complete[sample] = "VARCHAR"

            # Build the column definitions (CREATE) and column names (INSERT)
            sql_create_table_columns = []
            sql_create_table_columns_list = []
            for column in structure_complete:
                column_type = structure_complete[column]
                sql_create_table_columns.append(
                    f'"{column}" {column_type} default NULL'
                )
                sql_create_table_columns_list.append(f'"{column}"')

            # Create the variants table if it does not exist yet
            log.debug(f"Create Table {table_variants}")
            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
            self.conn.execute(sql_create_table)

            # Number of rows per chunk when loading the file
            chunksize = 100000

            # Field delimiter derived from the input format (tab by default)
            delimiter = file_format_delimiters.get(input_format, "\t")

            # Load the input file
            with open(self.input, "rt") as input_file:

                # NOTE(review): for compressed input, the plain handle opened
                # above is shadowed (left open until the `with` exits) and the
                # bgzf handle is never explicitly closed — consider opening
                # one or the other.
                if input_compressed:
                    input_file = bgzf.open(self.input, "rt")
                # Only VCF files carry a header block to skip
                if input_format in ["vcf"]:
                    header_len = self.get_header_length()
                else:
                    header_len = 0

                # Stream the file contents into the variants table
                self.insert_file_to_table(
                    input_file,
                    columns=sql_create_table_columns_list_sql,
                    header_len=header_len,
                    sep=delimiter,
                    chunksize=chunksize,
                )

        else:
            # Unsupported backend/format combination
            log.error(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )
            raise ValueError(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )

        # Optionally explode INFO fields into dedicated table columns
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        # Create indexes after insertion
        self.create_indexes()

The load_data function reads a VCF file and inserts it into a table, with options to drop the table before loading the data and specify a sample size.

Parameters
  • input_file: The path to the input file. This is the VCF file that will be loaded into the table
  • drop_variants_table: The drop_variants_table parameter is a boolean flag that determines whether the variants table should be dropped before loading the data. If set to True, the variants table will be dropped. If set to False (default), the variants table will not be dropped, defaults to False
  • sample_size: The sample_size parameter determines the number of rows to be sampled from the input file. If it is set to None, the default value of 20480 will be used, defaults to 20480
def get_explode_infos(self) -> bool:
1385    def get_explode_infos(self) -> bool:
1386        """
1387        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
1388        to False if it is not set.
1389        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
1390        value. If the parameter is not present, it will return False.
1391        """
1392
1393        return self.get_param().get("explode", {}).get("explode_infos", False)

The function get_explode_infos returns the value of the "explode_infos" parameter, defaulting to False if it is not set.

Returns

The method is returning the value of the "explode_infos" parameter, which is a boolean value. If the parameter is not present, it will return False.

def get_explode_infos_fields( self, explode_infos_fields: str = None, remove_fields_not_in_header: bool = False) -> list:
1395    def get_explode_infos_fields(
1396        self,
1397        explode_infos_fields: str = None,
1398        remove_fields_not_in_header: bool = False,
1399    ) -> list:
1400        """
1401        The `get_explode_infos_fields` function returns a list of exploded information fields based on
1402        the input parameter `explode_infos_fields`.
1403
1404        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
1405        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
1406        comma-separated list of field names to explode
1407        :type explode_infos_fields: str
1408        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
1409        flag that determines whether to remove fields that are not present in the header. If it is set
1410        to `True`, any field that is not in the header will be excluded from the list of exploded
1411        information fields. If it is set to `, defaults to False
1412        :type remove_fields_not_in_header: bool (optional)
1413        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
1414        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
1415        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
1416        Otherwise, it returns a list of exploded information fields after removing any spaces and
1417        splitting the string by commas.
1418        """
1419
1420        # If no fields, get it in param
1421        if not explode_infos_fields:
1422            explode_infos_fields = (
1423                self.get_param().get("explode", {}).get("explode_infos_fields", None)
1424            )
1425
1426        # If no fields, defined as all fields in header using keyword
1427        if not explode_infos_fields:
1428            explode_infos_fields = "*"
1429
1430        # If fields list not empty
1431        if explode_infos_fields:
1432
1433            # Input fields list
1434            if isinstance(explode_infos_fields, str):
1435                fields_input = explode_infos_fields.split(",")
1436            elif isinstance(explode_infos_fields, list):
1437                fields_input = explode_infos_fields
1438            else:
1439                fields_input = []
1440
1441            # Fields list without * keyword
1442            fields_without_all = fields_input.copy()
1443            if "*".casefold() in (item.casefold() for item in fields_without_all):
1444                fields_without_all.remove("*")
1445
1446            # Fields in header
1447            fields_in_header = sorted(list(set(self.get_header().infos)))
1448
1449            # Construct list of fields
1450            fields_output = []
1451            for field in fields_input:
1452
1453                # Strip field
1454                field = field.strip()
1455
1456                # format keyword * in regex
1457                if field.upper() in ["*"]:
1458                    field = ".*"
1459
1460                # Find all fields with pattern
1461                r = re.compile(field)
1462                fields_search = sorted(list(filter(r.match, fields_in_header)))
1463
1464                # Remove fields input from search
1465                if fields_search != [field]:
1466                    fields_search = sorted(
1467                        list(set(fields_search).difference(fields_input))
1468                    )
1469
1470                # If field is not in header (avoid not well formatted header)
1471                if not fields_search and not remove_fields_not_in_header:
1472                    fields_search = [field]
1473
1474                # Add found fields
1475                for new_field in fields_search:
1476                    # Add field, if not already exists, and if it is in header (if asked)
1477                    if (
1478                        new_field not in fields_output
1479                        and (
1480                            not remove_fields_not_in_header
1481                            or new_field in fields_in_header
1482                        )
1483                        and new_field not in [".*"]
1484                    ):
1485                        fields_output.append(new_field)
1486
1487            return fields_output
1488
1489        else:
1490
1491            return []

The get_explode_infos_fields function returns a list of exploded information fields based on the input parameter explode_infos_fields.

Parameters
  • explode_infos_fields: The explode_infos_fields parameter specifies the fields to be exploded, either as a comma-separated string or a list. Entries may be field names or regex patterns; the keyword "*" selects all fields present in the header
  • remove_fields_not_in_header: The parameter remove_fields_not_in_header is a boolean flag that determines whether to remove fields that are not present in the header. If it is set to True, any field that is not in the header will be excluded from the list of exploded information fields. If it is set to False (default), unknown fields are kept as-is, defaults to False
Returns

The function get_explode_infos_fields returns a list of exploded information fields. If the explode_infos_fields parameter is not provided, the configured value is used, falling back to "*" (all header fields). The returned list contains unique field names, with spaces stripped, comma-separated strings split, and patterns (including the "*" keyword) expanded against the header INFO fields.

def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
1493    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
1494        """
1495        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
1496        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
1497        not provided.
1498
1499        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
1500        prefix to be used for exploding or expanding information
1501        :type explode_infos_prefix: str
1502        :return: the value of the variable `explode_infos_prefix`.
1503        """
1504
1505        if not explode_infos_prefix:
1506            explode_infos_prefix = (
1507                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
1508            )
1509
1510        return explode_infos_prefix

The function get_explode_infos_prefix returns the value of the explode_infos_prefix parameter, or the configured value from the "explode" section of the parameters (empty string by default) if explode_infos_prefix is not provided.

Parameters
  • explode_infos_prefix: The parameter explode_infos_prefix is a string that specifies a prefix to be used for exploding or expanding information
Returns

the value of the variable explode_infos_prefix.

def add_column( self, table_name, column_name, column_type, default_value=None, drop: bool = False) -> dict:
    def add_column(
        self,
        table_name,
        column_name,
        column_type,
        default_value=None,
        drop: bool = False,
    ) -> dict:
        """
        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
        doesn't already exist.

        :param table_name: The name of the table to which you want to add a column
        :param column_name: The parameter "column_name" is the name of the column that you want to add
        to the table
        :param column_type: The `column_type` parameter specifies the data type of the column that you
        want to add to the table. It should be a string that represents the desired data type, such as
        "INTEGER", "TEXT", "REAL", etc
        :param default_value: The `default_value` parameter is an optional parameter that specifies the
        default value for the newly added column. If a default value is provided, it will be assigned to
        the column for any existing rows that do not have a value for that column
        :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column
        if it already exists in the table. If `drop` is set to `True`, the existing column is dropped
        and then re-created. If `drop` is set to `False` (default), the table is left unchanged and
        None is returned, defaults to False
        :type drop: bool (optional)
        :return: a dict describing the added column ("table_name", "column_name", "column_type",
        "default_value") when a brand new column was added, or None when the column already existed
        (including when it was dropped and re-created because `drop=True`).
        """

        # Track whether a genuinely new column was added vs. a drop/re-create
        added = False
        dropped = False

        # Check if the column already exists in the table
        # (a LIMIT 0 query is a cheap way to list the table's columns)
        query = f""" SELECT * FROM {table_name} LIMIT 0 """
        columns = self.get_query_to_df(query).columns.tolist()
        if column_name in columns:
            log.debug(
                f"The {column_name} column already exists in the {table_name} table"
            )
            if drop:
                self.drop_column(table_name=table_name, column_name=column_name)
                dropped = True
            else:
                # Column exists and no drop requested: nothing to do
                return None
        else:
            log.debug(f"The {column_name} column NOT exists in the {table_name} table")

        # Add column in table
        add_column_query = (
            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
        )
        if default_value is not None:
            add_column_query += f" DEFAULT {default_value}"
        self.execute_query(add_column_query)
        # A dropped-then-re-created column is NOT reported as newly added
        added = not dropped
        log.debug(
            f"The {column_name} column was successfully added to the {table_name} table"
        )

        if added:
            added_column = {
                "table_name": table_name,
                "column_name": column_name,
                "column_type": column_type,
                "default_value": default_value,
            }
        else:
            added_column = None

        return added_column

The add_column function adds a column to a SQLite or DuckDB table with a default value if it doesn't already exist.

Parameters
  • table_name: The name of the table to which you want to add a column
  • column_name: The parameter "column_name" is the name of the column that you want to add to the table
  • column_type: The column_type parameter specifies the data type of the column that you want to add to the table. It should be a string that represents the desired data type, such as "INTEGER", "TEXT", "REAL", etc
  • default_value: The default_value parameter is an optional parameter that specifies the default value for the newly added column. If a default value is provided, it will be assigned to the column for any existing rows that do not have a value for that column
  • drop: The drop parameter is a boolean flag that determines whether to drop the column if it already exists in the table. If drop is set to True, the function will drop the existing column before adding the new column. If drop is set to False (default), the table is left unchanged and None is returned, defaults to False
Returns

a dict describing the added column ("table_name", "column_name", "column_type", "default_value") when a brand new column was added, or None when the column already existed (including when it was dropped and re-created).

def drop_column( self, column: dict = None, table_name: str = None, column_name: str = None) -> bool:
1584    def drop_column(
1585        self, column: dict = None, table_name: str = None, column_name: str = None
1586    ) -> bool:
1587        """
1588        The `drop_column` function drops a specified column from a given table in a database and returns
1589        True if the column was successfully dropped, and False if the column does not exist in the
1590        table.
1591
1592        :param column: The `column` parameter is a dictionary that contains information about the column
1593        you want to drop. It has two keys:
1594        :type column: dict
1595        :param table_name: The `table_name` parameter is the name of the table from which you want to
1596        drop a column
1597        :type table_name: str
1598        :param column_name: The `column_name` parameter is the name of the column that you want to drop
1599        from the table
1600        :type column_name: str
1601        :return: a boolean value. It returns True if the column was successfully dropped from the table,
1602        and False if the column does not exist in the table.
1603        """
1604
1605        # Find column infos
1606        if column:
1607            if isinstance(column, dict):
1608                table_name = column.get("table_name", None)
1609                column_name = column.get("column_name", None)
1610            elif isinstance(column, str):
1611                table_name = self.get_table_variants()
1612                column_name = column
1613            else:
1614                table_name = None
1615                column_name = None
1616
1617        if not table_name and not column_name:
1618            return False
1619
1620        # Removed
1621        removed = False
1622
1623        # Check if the column already exists in the table
1624        query = f""" SELECT * FROM {table_name} LIMIT 0 """
1625        columns = self.get_query_to_df(query).columns.tolist()
1626        if column_name in columns:
1627            log.debug(f"The {column_name} column exists in the {table_name} table")
1628        else:
1629            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
1630            return False
1631
1632        # Add column in table # ALTER TABLE integers DROP k
1633        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
1634        self.execute_query(add_column_query)
1635        removed = True
1636        log.debug(
1637            f"The {column_name} column was successfully dropped to the {table_name} table"
1638        )
1639
1640        return removed

The drop_column function drops a specified column from a given table in a database and returns True if the column was successfully dropped, and False if the column does not exist in the table.

Parameters
  • column: The column parameter is a dictionary that contains information about the column you want to drop. It has two keys:
  • table_name: The table_name parameter is the name of the table from which you want to drop a column
  • column_name: The column_name parameter is the name of the column that you want to drop from the table
Returns

a boolean value. It returns True if the column was successfully dropped from the table, and False if the column does not exist in the table.

def explode_infos( self, prefix: str = None, create_index: bool = False, fields: list = None, force: bool = False, proccess_all_fields_together: bool = False) -> list:
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
    ) -> list:
        """
        The `explode_infos` function extracts VCF INFO fields into dedicated
        table columns and returns the list of columns it added.

        For each requested INFO field, a column named `<prefix><field>` is
        added to the variants table and populated by parsing the raw INFO
        string in SQL (one UPDATE per chromosome, to keep statements smaller).
        Nothing is done when the database is opened read-only ("RO" access).

        :param prefix: The `prefix` parameter is a string used as a prefix for
        the exploded INFO columns. When not provided (or not a string), the
        value of `self.get_explode_infos_prefix()` is used, falling back to
        "INFO/"
        :type prefix: str
        :param create_index: When `True`, indexes are (re)created after the
        explode step; when `False` they are not, defaults to False
        :type create_index: bool (optional)
        :param fields: List of INFO fields (names or patterns) to explode into
        individual columns. If not provided, all INFO fields are exploded
        :type fields: list
        :param force: When `True`, an existing column is dropped, re-created
        and its content re-computed; when `False`, existing columns are left
        untouched, defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: When `True`, all fields are
        updated in a single UPDATE statement per chromosome; when `False`,
        one UPDATE per field is issued, defaults to False
        :type proccess_all_fields_together: bool (optional)
        :return: The function `explode_infos` returns a list of added columns
        (dicts as produced by `add_column`).
        """

        # Indexes would slow down the mass UPDATEs below; drop them first
        self.drop_indexes()

        # connexion format ("duckdb" or "sqlite")
        connexion_format = self.get_connexion_format()

        # Access mode ("RO" means the database must not be modified)
        access = self.get_config().get("access", None)

        # Added columns
        added_columns = []

        if access not in ["RO"]:

            # prefix: fall back to the configured prefix, then to "INFO/"
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants
            table_variants = self.get_table_variants(clause="select")

            # extra infos (best-effort: may be unavailable)
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            sql_info_alter_table_array = []

            # Known fields: header INFO fields plus explicitly requested ones
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns (expands "*" and regexes)
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                # Final column name for this INFO field
                info_id_sql = prefix + info

                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Field type from the header; unknown fields default to
                    # a single String value
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Multi-valued fields are stored as VARCHAR regardless of type
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    # Re-compute content for new columns, or always when forced
                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Build the SET clause extracting `info` from the raw
                        # INFO string.
                        # NOTE(review): `update_info_field` is unbound when
                        # connexion_format is neither "duckdb" nor "sqlite" —
                        # presumably validated upstream at load time; confirm.
                        if connexion_format in ["duckdb"]:
                            # Leading ';' lets the regex anchor on ';FIELD=';
                            # empty or '.' values become NULL
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                                END
                            """
                        elif connexion_format in ["sqlite"]:
                            # SQLite has no REGEXP_EXTRACT: emulate it with
                            # instr/substr on the 'FIELD=' marker
                            update_info_field = f"""
                                "{info_id_sql}" =
                                    CASE
                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                    END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # Update chromosome by chromosome to keep each UPDATE bounded
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    # Fall back to a single, unfiltered UPDATE
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause (omitted when there is only one chromosome)
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table
                    if proccess_all_fields_together:
                        # Single UPDATE setting all exploded columns at once
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter_table_array_join}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        # One UPDATE per exploded column
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes
        if create_index:
            self.create_indexes()

        return added_columns

The explode_infos function takes a VCF file and explodes the INFO fields into individual columns, returning a list of added columns.

Parameters
  • prefix: The prefix parameter is a string that is used as a prefix for the exploded INFO fields. If the prefix is not provided or is set to None, the function will use the value of self.get_explode_infos_prefix() as the prefix
  • create_index: The create_index parameter is a boolean flag that specifies whether to create indexes on the exploded INFO fields. If set to True, indexes will be created; if set to False, indexes will not be created. The default value is False, defaults to False
  • fields: The fields parameter is a list of INFO fields that you want to explode into individual columns. If this parameter is not provided, all INFO fields will be exploded
  • force: The force parameter is a boolean flag that determines whether to drop and recreate the column if it already exists in the table. If force is set to True, the column will be dropped and recreated. If force is set to False, the column will not be dropped, defaults to False
  • proccess_all_fields_together: The proccess_all_fields_together parameter is a boolean flag that determines whether to process all the INFO fields together or individually. If set to True, all the INFO fields will be processed together. If set to False, each INFO field will be processed individually, defaults to False
Returns

The function explode_infos returns a list of added columns.

def create_indexes(self) -> None:
1848    def create_indexes(self) -> None:
1849        """
1850        Create indexes on the table after insertion
1851        """
1852
1853        # Access
1854        access = self.get_config().get("access", None)
1855
1856        # get table variants
1857        table_variants = self.get_table_variants("FROM")
1858
1859        if self.get_indexing() and access not in ["RO"]:
1860            # Create index
1861            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
1862            self.conn.execute(sql_create_table_index)
1863            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
1864            self.conn.execute(sql_create_table_index)
1865            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
1866            self.conn.execute(sql_create_table_index)
1867            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
1868            self.conn.execute(sql_create_table_index)
1869            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
1870            self.conn.execute(sql_create_table_index)
1871            for field in self.index_additionnal_fields:
1872                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
1873                self.conn.execute(sql_create_table_index)

Create indexes on the table after insertion

def drop_indexes(self) -> None:
1875    def drop_indexes(self) -> None:
1876        """
1877        Create indexes on the table after insertion
1878        """
1879
1880        # Access
1881        access = self.get_config().get("access", None)
1882
1883        # get table variants
1884        table_variants = self.get_table_variants("FROM")
1885
1886        # Get database format
1887        connexion_format = self.get_connexion_format()
1888
1889        if access not in ["RO"]:
1890            if connexion_format in ["duckdb"]:
1891                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
1892            elif connexion_format in ["sqlite"]:
1893                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
1894
1895            list_indexes = self.conn.execute(sql_list_indexes)
1896            index_names = [row[0] for row in list_indexes.fetchall()]
1897            for index in index_names:
1898                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
1899                self.conn.execute(sql_drop_table_index)

Drop all indexes attached to the variants table, listing them from the DuckDB or SQLite catalog depending on the connexion format.

def read_vcf_header(self, f) -> list:
1901    def read_vcf_header(self, f) -> list:
1902        """
1903        It reads the header of a VCF file and returns a list of the header lines
1904
1905        :param f: the file object
1906        :return: The header lines of the VCF file.
1907        """
1908
1909        header_list = []
1910        for line in f:
1911            header_list.append(line)
1912            if line.startswith("#CHROM"):
1913                break
1914        return header_list

It reads the header of a VCF file and returns a list of the header lines

Parameters
  • f: the file object
Returns

The header lines of the VCF file.

def read_vcf_header_file(self, file: str = None) -> list:
1916    def read_vcf_header_file(self, file: str = None) -> list:
1917        """
1918        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
1919        uncompressed files.
1920
1921        :param file: The `file` parameter is a string that represents the path to the VCF header file
1922        that you want to read. It is an optional parameter, so if you don't provide a value, it will
1923        default to `None`
1924        :type file: str
1925        :return: The function `read_vcf_header_file` returns a list.
1926        """
1927
1928        if self.get_input_compressed(input_file=file):
1929            with bgzf.open(file, "rt") as f:
1930                return self.read_vcf_header(f=f)
1931        else:
1932            with open(file, "rt") as f:
1933                return self.read_vcf_header(f=f)

The read_vcf_header_file function reads the header of a VCF file, handling both compressed and uncompressed files.

Parameters
  • file: The file parameter is a string that represents the path to the VCF header file that you want to read. It is an optional parameter, so if you don't provide a value, it will default to None
Returns

The function read_vcf_header_file returns a list.

def execute_query(self, query: str):
    """
    Execute a SQL query on the current connection and return the cursor.

    :param query: the query to be executed; falsy values are ignored
    :return: the result of conn.execute, or None if no query was given
    """

    # Guard clause: nothing to run
    if not query:
        return None
    return self.conn.execute(query)

It takes a query as an argument, executes it, and returns the results

Parameters
  • query: The query to be executed
Returns

The result of the query is being returned.

def export_output(
    self,
    output_file: str | None = None,
    output_header: str | None = None,
    export_header: bool = True,
    query: str | None = None,
    parquet_partitions: list | None = None,
    chunk_size: int | None = None,
    threads: int | None = None,
    sort: bool = False,
    index: bool = False,
    order_by: str | None = None,
) -> bool:
    """
    Export the variants data to an output file in VCF, CSV, TSV, PSV or
    Parquet format.

    :param output_file: Path of the file to generate; defaults to the
        object's configured output file
    :param output_header: Path of the file receiving the VCF header; if not
        provided, defaults to the output file name with the extension ".hdr"
    :param export_header: Whether to export the header to a separate file
        (automatically disabled for VCF output, where the header is inline),
        defaults to True
    :param query: Optional SQL query to filter/select the data to export
    :param parquet_partitions: Columns used to partition a Parquet export
        (a comma-separated string is also accepted)
    :param chunk_size: Number of records per batch for a Parquet export
    :param threads: Number of threads to use; defaults to the object's
        configured threads
    :param sort: Whether to sort the output by genomic coordinates,
        defaults to False
    :param index: Whether to create an index on the output file,
        defaults to False
    :param order_by: Column(s) used to order the output (VCF format only)
    :return: True if the output file exists after export, otherwise None
    """

    # Log
    log.info("Exporting...")

    # Resolve full paths
    output_file = full_path(output_file)
    output_header = full_path(output_header)

    # Config
    config = self.get_config()

    # Param
    param = self.get_param()

    # Temporary files to remove at the end
    tmp_to_remove = []

    # If no output provided, use the configured one
    if not output_file:
        output_file = self.get_output()

    # If no threads provided, use the configured ones
    if not threads:
        threads = self.get_threads()

    # Auto header name with extension
    if export_header or output_header:
        if not output_header:
            output_header = f"{output_file}.hdr"
        # Export header — written to f"{output_file}.hdr".
        # NOTE(review): a caller-provided output_header with a different name
        # is not used as the write target here — confirm intended behavior.
        self.export_header(output_file=output_file)

    # Switch off separate header file for VCF output (header is inline)
    output_file_type = get_file_format(output_file)
    if output_file_type in ["vcf"]:
        export_header = False
        tmp_to_remove.append(output_header)

    # Chunk size
    if not chunk_size:
        chunk_size = config.get("chunk_size", None)

    # Parquet partitions (accept comma-separated string)
    if not parquet_partitions:
        parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
    if parquet_partitions and isinstance(parquet_partitions, str):
        parquet_partitions = parquet_partitions.split(",")

    # Order by
    if not order_by:
        order_by = param.get("export", {}).get("order_by", "")

    # Include header lines in the output body?
    header_in_output = param.get("export", {}).get("include_header", False)

    # Database source is the current connection by default
    database_source = self.get_connexion()

    # Connexion format
    connexion_format = self.get_connexion_format()

    # Explode INFO fields if requested
    if self.get_explode_infos():
        self.explode_infos(
            prefix=self.get_explode_infos_prefix(),
            fields=self.get_explode_infos_fields(),
            force=False,
        )

    # SQLite cannot be exported directly: dump it to a temporary Parquet file
    if connexion_format in ["sqlite"]:

        # Temporary Parquet file with a random suffix
        random_tmp = "".join(
            random.choice(string.ascii_lowercase) for i in range(10)
        )
        database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
        tmp_to_remove.append(database_source)

        # Table Variants
        table_variants = self.get_table_variants()

        # Create export query
        sql_query_export_subquery = f"""
            SELECT * FROM {table_variants}
            """

        # Write source file
        fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))

    # Create database object for the export
    database = Database(
        database=database_source,
        table="variants",
        header_file=output_header,
        conn_config=self.get_connexion_config(),
    )

    # Existing columns, read from the database rather than the header file
    existing_columns_header = database.get_header_columns_from_database()

    # Export file
    database.export(
        output_database=output_file,
        output_header=output_header,
        existing_columns_header=existing_columns_header,
        parquet_partitions=parquet_partitions,
        chunk_size=chunk_size,
        threads=threads,
        sort=sort,
        index=index,
        header_in_output=header_in_output,
        order_by=order_by,
        query=query,
        export_header=export_header,
    )

    # Remove temporary files
    remove_if_exists(tmp_to_remove)

    # Fix: the original evaluated the same existence test twice and AND-ed
    # the two identical results; a single test is equivalent
    return os.path.exists(output_file) or None

The export_output function exports data from a VCF file to a specified output file in various formats, including VCF, CSV, TSV, PSV, and Parquet.

Parameters
  • output_file: The output_file parameter is a string that specifies the name of the output file to be generated by the function. This is where the exported data will be saved
  • output_header: The output_header parameter is a string that specifies the name of the file where the header of the VCF file will be exported. If this parameter is not provided, the header will be exported to a file with the same name as the output_file parameter, but with the extension ".hdr"
  • export_header: The export_header parameter is a boolean flag that determines whether the header of a VCF file should be exported to a separate file or not. If export_header is True, the header will be exported to a file. If export_header is False, the header will not be, defaults to True, if output format is not VCF
  • query: The query parameter is an optional SQL query that can be used to filter and select specific data from the VCF file before exporting it. If provided, only the data that matches the query will be exported
  • parquet_partitions: The parquet_partitions parameter is a list that specifies the columns to be used for partitioning the Parquet file during export. Partitioning is a way to organize data in a hierarchical directory structure based on the values of one or more columns. This can improve query performance when working with large datasets
  • chunk_size: The chunk_size parameter specifies the number of records in batch when exporting data in Parquet format. This parameter is used for partitioning the Parquet file into multiple files.
  • threads: The threads parameter is an optional parameter that specifies the number of threads to be used during the export process. It determines the level of parallelism and can improve the performance of the export operation. If not provided, the function will use the default number of threads
  • sort: The sort parameter is a boolean flag that determines whether the output file should be sorted or not. If sort is set to True, the output file will be sorted based on the genomic coordinates of the variants. By default, the value of sort is False, defaults to False
  • index: The index parameter is a boolean flag that determines whether an index should be created on the output file. If index is True, an index will be created. If index is False, no index will be created. The default value is False, defaults to False
  • order_by: The order_by parameter is a string that specifies the column(s) to use for sorting the output file. This parameter is only applicable when exporting data in VCF format
Returns

a boolean value. It checks if the output file exists and returns True if it does, or None if it doesn't.

def get_extra_infos(self, table: str = None) -> list:
    """
    Return the columns present in a table but absent from the header.

    :param table: name of the table to inspect; when not provided, the
        variants table is used and its header columns are excluded
    :return: list of columns found in the table but not in the header
    """

    known_columns = []

    # Default to the variants table and exclude its declared header columns
    if not table:
        table = self.get_table_variants(clause="from")
        known_columns = self.get_header_columns()

    # Probe the table for its actual column set
    probe_query = f""" SELECT * FROM {table} LIMIT 1 """
    log.debug(f"query {probe_query}")
    all_columns = self.get_query_to_df(probe_query).columns.tolist()

    # Extra infos are whatever the header does not declare
    return [col for col in all_columns if col not in known_columns]

The get_extra_infos function returns a list of columns that are in a specified table but not in the header.

Parameters
  • table: The table parameter in the get_extra_infos function is used to specify the name of the table from which you want to retrieve the extra columns that are not present in the header. If the table parameter is not provided when calling the function, it will default to using the variants
Returns

A list of columns that are in the specified table but not in the header of the table.

def get_extra_infos_sql(self, table: str = None) -> str:
    """
    Return the extra-info columns as a comma-separated, double-quoted
    SQL column list.

    :param table: name of the table to get the extra infos from; when None,
        the default variants table is used
    :return: a string such as '"COL1", "COL2"' (empty if no extra infos)
    """

    quoted = [f'"{str(info)}"' for info in self.get_extra_infos(table=table)]
    return ", ".join(quoted)

It returns a string of the extra infos, separated by commas, and each extra info is surrounded by double quotes

Parameters
  • table: The name of the table to get the extra infos from. If None, the default table is used
Returns

A string of the extra infos

def export_header(
    self,
    header_name: str = None,
    output_file: str = None,
    output_file_ext: str = ".hdr",
    clean_header: bool = True,
    remove_chrom_line: bool = False,
) -> str:
    """
    Extract the VCF header, optionally clean it and/or drop the #CHROM line,
    and write it to "<output_file><output_file_ext>".

    :param header_name: name of the header file to be created; if neither
        header_name nor output_file is given, the object's output file is
        used as base name
    :param output_file: base name of the file the header is written to
        (output_file_ext is appended to it)
    :param output_file_ext: extension appended to output_file, defaults
        to ".hdr"
    :param clean_header: whether to rewrite malformed "##FORMAT ... Type=Flag"
        lines as Type=String, defaults to True
    :param remove_chrom_line: whether to drop the final #CHROM line before
        writing, defaults to False
    :return: the name of the header file created, or None when the object
        has no header (the original raised UnboundLocalError in that case)
    """

    if not header_name and not output_file:
        output_file = self.get_output()

    # Fix: initialize so a missing header returns None instead of raising
    # UnboundLocalError at the final return
    tmp_header_name = None

    if self.get_header():

        # Get header object
        header_obj = self.get_header()

        # Database of the input file, used to fetch its real columns
        db_for_header = Database(database=self.get_input())
        db_header_columns = db_for_header.get_columns()

        with tempfile.TemporaryDirectory() as tmpdir:

            # Write the header to a temporary file (vcf.Writer emits it);
            # fix: context manager guarantees the file handle is closed
            header_file_tmp = os.path.join(tmpdir, "header")
            with open(header_file_tmp, "w") as f:
                vcf.Writer(f, header_obj)

            # Replace the #CHROM line with the file's real columns
            header_list = db_for_header.read_header_file(
                header_file=header_file_tmp
            )
            header_list[-1] = "\t".join(db_header_columns)

            # Remove #CHROM line if asked
            if remove_chrom_line:
                header_list.pop()

            # Clean malformed header lines (FORMAT fields cannot be Flag)
            if clean_header:
                header_list = [
                    re.subn(
                        "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
                        r"##FORMAT=<ID=\1,Number=\2,Type=String",
                        head,
                        2,
                    )[0]
                    for head in header_list
                ]

        tmp_header_name = output_file + output_file_ext

        # Fix: context manager instead of bare open/close (no leak on error)
        with open(tmp_header_name, "w") as f:
            for line in header_list:
                f.write(line)

    return tmp_header_name

The export_header function takes a VCF file, extracts the header, modifies it according to specified options, and writes it to a new file.

Parameters
  • header_name: The header_name parameter is the name of the header file to be created. If this parameter is not specified, the header will be written to the output file
  • output_file: The output_file parameter in the export_header function is used to specify the name of the output file where the header will be written. If this parameter is not provided, the header will be written to a temporary file
  • output_file_ext: The output_file_ext parameter in the export_header function is a string that represents the extension of the output header file. By default, it is set to ".hdr" if not specified by the user. This extension will be appended to the output_file name to create the final, defaults to .hdr
  • clean_header: The clean_header parameter in the export_header function is a boolean flag that determines whether the header should be cleaned or not. When clean_header is set to True, the function will clean the header by modifying certain lines based on a specific pattern. If clean_header, defaults to True
  • remove_chrom_line: The remove_chrom_line parameter in the export_header function is a boolean flag that determines whether the #CHROM line should be removed from the header before writing it to the output file. If set to True, the #CHROM line will be removed; if set to False, it will be kept. Defaults to False
Returns

The function export_header returns the name of the temporary header file that is created.

def export_variant_vcf( self, vcf_file, remove_info: bool = False, add_samples: bool = True, list_samples: list = [], where_clause: str = '', index: bool = False, threads: int | None = None) -> bool | None:
2277    def export_variant_vcf(
2278        self,
2279        vcf_file,
2280        remove_info: bool = False,
2281        add_samples: bool = True,
2282        list_samples: list = [],
2283        where_clause: str = "",
2284        index: bool = False,
2285        threads: int | None = None,
2286    ) -> bool | None:
2287        """
2288        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
2289        remove INFO field, add samples, and control compression and indexing.
2290
2291        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
2292        written to. It is the output file that will contain the filtered VCF data based on the specified
2293        parameters
2294        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
2295        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
2296        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
2297        in, defaults to False
2298        :type remove_info: bool (optional)
2299        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
2300        the samples should be added to the VCF file or not. If set to True, the samples will be added.
2301        If set to False, the samples will be removed. The default value is True, defaults to True
2302        :type add_samples: bool (optional)
2303        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
2304        in the output VCF file. By default, all samples will be included. If you provide a list of
2305        samples, only those samples will be included in the output file
2306        :type list_samples: list
2307        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
2308        determines whether or not to create an index for the output VCF file. If `index` is set to
2309        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
2310        :type index: bool (optional)
2311        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
2312        number of threads to use for exporting the VCF file. It determines how many parallel threads
2313        will be used during the export process. More threads can potentially speed up the export process
2314        by utilizing multiple cores of the processor. If
2315        :type threads: int | None
2316        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
2317        method with various parameters including the output file, query, threads, sort flag, and index
2318        flag. The `export_output` method is responsible for exporting the VCF data based on the
2319        specified parameters and configurations provided in the `export_variant_vcf` function.
2320        """
2321
2322        # Config
2323        config = self.get_config()
2324
2325        # Extract VCF
2326        log.debug("Export VCF...")
2327
2328        # Table variants
2329        table_variants = self.get_table_variants()
2330
2331        # Threads
2332        if not threads:
2333            threads = self.get_threads()
2334
2335        # Info fields
2336        if remove_info:
2337            if not isinstance(remove_info, str):
2338                remove_info = "."
2339            info_field = f"""'{remove_info}' as INFO"""
2340        else:
2341            info_field = "INFO"
2342
2343        # Samples fields
2344        if add_samples:
2345            if not list_samples:
2346                list_samples = self.get_header_sample_list()
2347            if list_samples:
2348                samples_fields = " , FORMAT , " + " , ".join(list_samples)
2349            else:
2350                samples_fields = ""
2351            log.debug(f"samples_fields: {samples_fields}")
2352        else:
2353            samples_fields = ""
2354
2355        # Where clause
2356        if where_clause is None:
2357            where_clause = ""
2358
2359        # Variants
2360        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
2361        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
2362        log.debug(f"sql_query_select={sql_query_select}")
2363
2364        return self.export_output(
2365            output_file=vcf_file,
2366            output_header=None,
2367            export_header=True,
2368            query=sql_query_select,
2369            parquet_partitions=None,
2370            chunk_size=config.get("chunk_size", None),
2371            threads=threads,
2372            sort=True,
2373            index=index,
2374            order_by=None,
2375        )

The export_variant_vcf function exports a VCF file with specified samples, allowing options to remove INFO field, add samples, and control compression and indexing.

Parameters
  • vcf_file: The vcf_file parameter is the name of the file where the VCF data will be written to. It is the output file that will contain the filtered VCF data based on the specified parameters
  • remove_info: The remove_info parameter in the export_variant_vcf function is a boolean flag that determines whether to remove the INFO field from the output VCF file. If set to True, the INFO field will be removed. If set to False, the INFO field will be included in, defaults to False
  • add_samples: The add_samples parameter is a boolean parameter that determines whether the samples should be added to the VCF file or not. If set to True, the samples will be added. If set to False, the samples will be removed. The default value is True, defaults to True
  • list_samples: The list_samples parameter is a list of samples that you want to include in the output VCF file. By default, all samples will be included. If you provide a list of samples, only those samples will be included in the output file
  • index: The index parameter in the export_variant_vcf function is a boolean flag that determines whether or not to create an index for the output VCF file. If index is set to True, the output VCF file will be indexed using tabix. If index, defaults to False
  • threads: The threads parameter in the export_variant_vcf function specifies the number of threads to use for exporting the VCF file. It determines how many parallel threads will be used during the export process. More threads can potentially speed up the export process by utilizing multiple cores of the processor. If not provided, the object's default number of threads is used
Returns

The export_variant_vcf function returns the result of calling the export_output method with various parameters including the output file, query, threads, sort flag, and index flag. The export_output method is responsible for exporting the VCF data based on the specified parameters and configurations provided in the export_variant_vcf function.

def run_commands(self, commands: list = None, threads: int = 1) -> None:
    """
    Run a list of shell commands in parallel.

    :param commands: the commands to run (defaults to an empty list)
    :param threads: the number of threads to use, defaults to 1
    """

    # Fix: avoid mutable default argument (was commands=[])
    if commands is None:
        commands = []

    run_parallel_commands(commands, threads)

It takes a list of commands and runs them in parallel using the number of threads specified

Parameters
  • commands: A list of commands to run
  • threads: The number of threads to use, defaults to 1 (optional)
def get_threads(self, default: int = 1) -> int:
    """
    Return the number of threads to use for the current job.

    The value is taken from the parameters, then the configuration; a
    missing or falsy value yields the default, and a value <= 0 yields the
    machine's CPU count.

    :param default: fallback thread count when none is configured,
        defaults to 1
    :return: the number of threads to use
    """

    # Param takes precedence over config
    requested = self.get_param().get(
        "threads", self.get_config().get("threads", None)
    )

    if not requested:
        return default

    requested = int(requested)
    if requested <= 0:
        # Non-positive means "use every available core"
        return os.cpu_count()
    return requested

This function returns the number of threads to use for a job, with a default value of 1 if not specified.

Parameters
  • default: The default parameter in the get_threads method is used to specify the default number of threads to use if no specific value is provided. If no value is provided for the threads parameter in the configuration or input parameters, the default value will be used, defaults to 1
Returns

the number of threads to use for the current job.

def get_memory(self, default: str = None) -> str:
2418    def get_memory(self, default: str = None) -> str:
2419        """
2420        This function retrieves the memory value from parameters or configuration with a default value
2421        if not found.
2422
2423        :param default: The `get_memory` function takes in a default value as a string parameter. This
2424        default value is used as a fallback in case the `memory` parameter is not provided in the
2425        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
2426        the function
2427        :type default: str
2428        :return: The `get_memory` function returns a string value representing the memory parameter. If
2429        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
2430        return the default value provided as an argument to the function.
2431        """
2432
2433        # Config
2434        config = self.get_config()
2435
2436        # Param
2437        param = self.get_param()
2438
2439        # Input threads
2440        input_memory = param.get("memory", config.get("memory", None))
2441
2442        # Check threads
2443        if input_memory:
2444            memory = input_memory
2445        else:
2446            memory = default
2447
2448        return memory

This function retrieves the memory value from parameters or configuration with a default value if not found.

Parameters
  • default: A fallback value, as a string, used when the memory parameter is not provided in the param dictionary or the config dictionary. If memory is not found in either dictionary, the function returns this default value.
Returns

The get_memory function returns a string value representing the memory parameter. If the input_memory is provided in the parameters, it will return that value. Otherwise, it will return the default value provided as an argument to the function.

def update_from_vcf(self, vcf_file: str) -> None:
2450    def update_from_vcf(self, vcf_file: str) -> None:
2451        """
2452        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
2453
2454        :param vcf_file: the path to the VCF file
2455        """
2456
2457        connexion_format = self.get_connexion_format()
2458
2459        if connexion_format in ["duckdb"]:
2460            self.update_from_vcf_duckdb(vcf_file)
2461        elif connexion_format in ["sqlite"]:
2462            self.update_from_vcf_sqlite(vcf_file)

If the database is duckdb, then use the parquet method, otherwise use the sqlite method

Parameters
  • vcf_file: the path to the VCF file
def update_from_vcf_duckdb(self, vcf_file: str) -> None:
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        It takes a VCF file and updates the INFO column of the variants table in the database with the
        INFO column of the VCF file

        Matching is done on the '#CHROM', 'POS', 'REF' and 'ALT' columns; the
        VCF INFO is appended to the existing INFO, separated by ';' when both
        values are non-empty ('' or '.' count as empty).

        :param vcf_file: the path to the VCF file
        """

        # variants table
        table_variants = self.get_table_variants()

        # Load the VCF into a DataFrame, skipping the '##' header lines.
        # NOTE: duckdb resolves 'vcf_df' in the SQL below by the local
        # variable name (replacement scan), so this name must not change.
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # Append the matching VCF INFO to the existing INFO:
        # - keep the existing INFO when it is not empty ('' or '.')
        # - add a ';' separator only when both INFO values are non-empty
        # NOTE(review): for variants with no match the correlated subquery
        # yields NULL; duckdb concat() treats NULL as empty string, so the
        # existing INFO is preserved ('.'/'' become '') — confirm intended.
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)

It takes a VCF file and updates the INFO column of the variants table in the database with the INFO column of the VCF file

Parameters
  • vcf_file: the path to the VCF file
def update_from_vcf_sqlite(self, vcf_file: str) -> None:
2520    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
2521        """
2522        It creates a temporary table in the SQLite database, loads the VCF file into the temporary
2523        table, then updates the INFO column of the variants table with the INFO column of the temporary
2524        table
2525
2526        :param vcf_file: The path to the VCF file you want to update the database with
2527        """
2528
2529        # Create a temporary table for the VCF
2530        table_vcf = "tmp_vcf"
2531        sql_create = (
2532            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
2533        )
2534        self.conn.execute(sql_create)
2535
2536        # Loading VCF into temporaire table
2537        vcf_df = pd.read_csv(
2538            vcf_file, sep="\t", comment="#", header=None, low_memory=False
2539        )
2540        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
2541        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)
2542
2543        # Update table 'variants' with VCF data
2544        # warning: CONCAT as || operator
2545        sql_query_update = f"""
2546            UPDATE variants as table_variants
2547            SET INFO = CASE
2548                            WHEN INFO NOT IN ('', '.')
2549                            THEN INFO
2550                            ELSE ''
2551                        END ||
2552                        (
2553                        SELECT 
2554                            CASE 
2555                                WHEN table_variants.INFO NOT IN ('','.') 
2556                                    AND table_vcf.INFO NOT IN ('','.')  
2557                                THEN ';' 
2558                                ELSE '' 
2559                            END || 
2560                            CASE 
2561                                WHEN table_vcf.INFO NOT IN ('','.') 
2562                                THEN table_vcf.INFO 
2563                                ELSE '' 
2564                            END
2565                        FROM {table_vcf} as table_vcf
2566                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
2567                            AND table_vcf.\"POS\" = table_variants.\"POS\"
2568                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
2569                            AND table_vcf.\"REF\" = table_variants.\"REF\"
2570                        )
2571        """
2572        self.conn.execute(sql_query_update)
2573
2574        # Drop temporary table
2575        sql_drop = f"DROP TABLE {table_vcf}"
2576        self.conn.execute(sql_drop)

It creates a temporary table in the SQLite database, loads the VCF file into the temporary table, then updates the INFO column of the variants table with the INFO column of the temporary table

Parameters
  • vcf_file: The path to the VCF file you want to update the database with
def drop_variants_table(self) -> None:
2578    def drop_variants_table(self) -> None:
2579        """
2580        > This function drops the variants table
2581        """
2582
2583        table_variants = self.get_table_variants()
2584        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
2585        self.conn.execute(sql_table_variants)

This function drops the variants table

def set_variant_id(self, variant_id_column: str = 'variant_id', force: bool = None) -> str:
2587    def set_variant_id(
2588        self, variant_id_column: str = "variant_id", force: bool = None
2589    ) -> str:
2590        """
2591        It adds a column to the variants table called `variant_id` and populates it with a hash of the
2592        `#CHROM`, `POS`, `REF`, and `ALT` columns
2593
2594        :param variant_id_column: The name of the column to be created in the variants table, defaults
2595        to variant_id
2596        :type variant_id_column: str (optional)
2597        :param force: If True, the variant_id column will be created even if it already exists
2598        :type force: bool
2599        :return: The name of the column that contains the variant_id
2600        """
2601
2602        # Assembly
2603        assembly = self.get_param().get(
2604            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
2605        )
2606
2607        # INFO/Tag prefix
2608        prefix = self.get_explode_infos_prefix()
2609
2610        # Explode INFO/SVTYPE
2611        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])
2612
2613        # variants table
2614        table_variants = self.get_table_variants()
2615
2616        # variant_id column
2617        if not variant_id_column:
2618            variant_id_column = "variant_id"
2619
2620        # Creta variant_id column
2621        if "variant_id" not in self.get_extra_infos() or force:
2622
2623            # Create column
2624            self.add_column(
2625                table_name=table_variants,
2626                column_name=variant_id_column,
2627                column_type="UBIGINT",
2628                default_value="0",
2629            )
2630
2631            # Update column
2632            self.conn.execute(
2633                f"""
2634                    UPDATE {table_variants}
2635                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
2636                """
2637            )
2638
2639        # Remove added columns
2640        for added_column in added_columns:
2641            self.drop_column(column=added_column)
2642
2643        # return variant_id column name
2644        return variant_id_column

It adds a column to the variants table called variant_id and populates it with a hash of the #CHROM, POS, REF, and ALT columns

Parameters
  • variant_id_column: The name of the column to be created in the variants table, defaults to variant_id
  • force: If True, the variant_id column will be created even if it already exists
Returns

The name of the column that contains the variant_id

def get_variant_id_column(self, variant_id_column: str = 'variant_id', force: bool = None) -> str:
2646    def get_variant_id_column(
2647        self, variant_id_column: str = "variant_id", force: bool = None
2648    ) -> str:
2649        """
2650        This function returns the variant_id column name
2651
2652        :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
2653        defaults to variant_id
2654        :type variant_id_column: str (optional)
2655        :param force: If True, will force the variant_id to be set to the value of variant_id_column. If
2656        False, will only set the variant_id if it is not already set. If None, will set the variant_id
2657        if it is not already set, or if it is set
2658        :type force: bool
2659        :return: The variant_id column name.
2660        """
2661
2662        return self.set_variant_id(variant_id_column=variant_id_column, force=force)

This function returns the variant_id column name

Parameters
  • variant_id_column: The name of the column in the dataframe that contains the variant IDs, defaults to variant_id
  • force: If True, the variant_id column is recomputed even when it already exists. If False or None, the variant_id column is only created and populated when it does not already exist.
Returns

The variant_id column name.

def scan_databases( self, database_formats: list = ['parquet'], database_releases: list = ['current']) -> dict:
2668    def scan_databases(
2669        self,
2670        database_formats: list = ["parquet"],
2671        database_releases: list = ["current"],
2672    ) -> dict:
2673        """
2674        The function `scan_databases` scans for available databases based on specified formats and
2675        releases.
2676
2677        :param database_formats: The `database_formats` parameter is a list that specifies the formats
2678        of the databases to be scanned. In this case, the accepted format is "parquet"
2679        :type database_formats: list ["parquet"]
2680        :param database_releases: The `database_releases` parameter is a list that specifies the
2681        releases of the databases to be scanned. In the provided function, the default value for
2682        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
2683        databases that are in the "current"
2684        :type database_releases: list
2685        :return: The function `scan_databases` returns a dictionary containing information about
2686        databases that match the specified formats and releases.
2687        """
2688
2689        # Config
2690        config = self.get_config()
2691
2692        # Param
2693        param = self.get_param()
2694
2695        # Param - Assembly
2696        assembly = param.get("assembly", config.get("assembly", None))
2697        if not assembly:
2698            assembly = DEFAULT_ASSEMBLY
2699            log.warning(f"Default assembly '{assembly}'")
2700
2701        # Scan for availabled databases
2702        log.info(
2703            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
2704        )
2705        databases_infos_dict = databases_infos(
2706            database_folder_releases=database_releases,
2707            database_formats=database_formats,
2708            assembly=assembly,
2709            config=config,
2710        )
2711        log.info(
2712            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
2713        )
2714
2715        return databases_infos_dict

The function scan_databases scans for available databases based on specified formats and releases.

Parameters
  • database_formats: The database_formats parameter is a list that specifies the formats of the databases to be scanned. In this case, the accepted format is "parquet"
  • database_releases: The database_releases parameter is a list that specifies the releases of the databases to be scanned. In the provided function, the default value for database_releases is set to ["current"], meaning that by default, the function will scan databases that are in the "current" release.
Returns

The function scan_databases returns a dictionary containing information about databases that match the specified formats and releases.

def annotation(self) -> None:
    def annotation(self) -> None:
        """
        It annotates the VCF file with the annotations specified in the config file.

        The quick 'annotations' parameter (a comma-separated string) and the
        per-tool 'annotation_*' parameters are parsed and merged into the
        structured param['annotation'] dictionary, each database is resolved
        to an existing file and assigned to an annotation tool, and finally
        each configured tool is run.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param - Assembly (param takes precedence over config, then default)
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # Folders searched when resolving database names to files
        annotations_databases = set(
            config.get("folders", {})
            .get("databases", {})
            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
            + config.get("folders", {})
            .get("databases", {})
            .get("parquet", ["~/howard/databases/parquet/current"])
            + config.get("folders", {})
            .get("databases", {})
            .get("bcftools", ["~/howard/databases/bcftools/current"])
        )

        # Get param annotations (comma-separated string of databases/tools)
        if param.get("annotations", None) and isinstance(
            param.get("annotations", None), str
        ):
            log.debug(param.get("annotations", None))
            param_annotation_list = param.get("annotations").split(",")
        else:
            param_annotation_list = []

        # Merge each per-tool 'annotation_*' parameter into the list,
        # prefixed with its tool name where relevant
        if param.get("annotation_parquet", None) != None:
            log.debug(
                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
            )
            if isinstance(param.get("annotation_parquet", None), list):
                param_annotation_list.append(",".join(param.get("annotation_parquet")))
            else:
                param_annotation_list.append(param.get("annotation_parquet"))
        if param.get("annotation_snpsift", None) != None:
            if isinstance(param.get("annotation_snpsift", None), list):
                param_annotation_list.append(
                    "snpsift:"
                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
                )
        if param.get("annotation_snpeff", None) != None:
            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
        if param.get("annotation_bcftools", None) != None:
            if isinstance(param.get("annotation_bcftools", None), list):
                param_annotation_list.append(
                    "bcftools:"
                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
                )
        if param.get("annotation_annovar", None) != None:
            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
        if param.get("annotation_exomiser", None) != None:
            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
        if param.get("annotation_splice", None) != None:
            param_annotation_list.append("splice:" + param.get("annotation_splice"))

        # Merge param annotations list
        param["annotations"] = ",".join(param_annotation_list)

        # debug
        log.debug(f"param_annotations={param['annotations']}")

        if param.get("annotations"):

            # Log
            # log.info("Annotations - Check annotation parameters")

            if not "annotation" in param:
                param["annotation"] = {}

            # List of annotations parameters: map each entry to its
            # annotation fields (None means all INFO fields)
            annotations_list_input = {}
            if isinstance(param.get("annotations", None), str):
                annotation_file_list = [
                    value for value in param.get("annotations", "").split(",")
                ]
                for annotation_file in annotation_file_list:
                    annotations_list_input[annotation_file] = {"INFO": None}
            else:
                annotations_list_input = param.get("annotations", {})

            log.info(f"Quick Annotations:")
            for annotation_key in list(annotations_list_input.keys()):
                log.info(f"   {annotation_key}")

            # List of annotations and associated fields
            annotations_list = {}

            for annotation_file in annotations_list_input:

                # 'ALL[:format=...][:release=...]' expands to every
                # database found by scan_databases
                if (
                    annotation_file.upper() == "ALL"
                    or annotation_file.upper().startswith("ALL:")
                ):

                    # check ALL parameters (formats, releases),
                    # e.g. 'ALL:format=parquet+vcf:release=current'
                    annotation_file_split = annotation_file.split(":")
                    database_formats = "parquet"
                    database_releases = "current"
                    for annotation_file_option in annotation_file_split[1:]:
                        database_all_options_split = annotation_file_option.split("=")
                        if database_all_options_split[0] == "format":
                            database_formats = database_all_options_split[1].split("+")
                        if database_all_options_split[0] == "release":
                            database_releases = database_all_options_split[1].split("+")

                    # Scan for available databases
                    databases_infos_dict = self.scan_databases(
                        database_formats=database_formats,
                        database_releases=database_releases,
                    )

                    # Add found databases in annotation parameters
                    for database_infos in databases_infos_dict.keys():
                        annotations_list[database_infos] = {"INFO": None}

                else:
                    annotations_list[annotation_file] = annotations_list_input[
                        annotation_file
                    ]

            # Check each database and route it to its annotation tool
            if len(annotations_list):

                log.info(
                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
                )

                for annotation_file in annotations_list:

                    # Init
                    annotations = annotations_list.get(annotation_file, None)

                    # Annotation snpEff
                    if annotation_file.startswith("snpeff"):

                        log.debug(f"Quick Annotation snpEff")

                        if "snpeff" not in param["annotation"]:
                            param["annotation"]["snpeff"] = {}

                        if "options" not in param["annotation"]["snpeff"]:
                            param["annotation"]["snpeff"]["options"] = ""

                        # snpEff options: everything after 'snpeff:'
                        param["annotation"]["snpeff"]["options"] = "".join(
                            annotation_file.split(":")[1:]
                        )

                    # Annotation Annovar
                    elif annotation_file.startswith("annovar"):

                        log.debug(f"Quick Annotation Annovar")

                        if "annovar" not in param["annotation"]:
                            param["annotation"]["annovar"] = {}

                        if "annotations" not in param["annotation"]["annovar"]:
                            param["annotation"]["annovar"]["annotations"] = {}

                        # Options: each ':'-separated token is an Annovar database
                        annotation_file_split = annotation_file.split(":")
                        for annotation_file_annotation in annotation_file_split[1:]:
                            if annotation_file_annotation:
                                param["annotation"]["annovar"]["annotations"][
                                    annotation_file_annotation
                                ] = annotations

                    # Annotation Exomiser
                    elif annotation_file.startswith("exomiser"):

                        log.debug(f"Quick Annotation Exomiser")

                        param["annotation"]["exomiser"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Splice
                    elif annotation_file.startswith("splice"):

                        log.debug(f"Quick Annotation Splice")

                        param["annotation"]["splice"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Parquet or BCFTOOLS
                    else:

                        # Tools detection: an explicit 'bcftools:'/'snpsift:'
                        # prefix forces the tool; otherwise it is chosen from
                        # the database file format below
                        if annotation_file.startswith("bcftools:"):
                            annotation_tool_initial = "bcftools"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        elif annotation_file.startswith("snpsift:"):
                            annotation_tool_initial = "snpsift"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        else:
                            annotation_tool_initial = None

                        # list of files ('+' and ':' both act as separators)
                        annotation_file_list = annotation_file.replace("+", ":").split(
                            ":"
                        )

                        for annotation_file in annotation_file_list:

                            if annotation_file:

                                # Annotation tool initial
                                annotation_tool = annotation_tool_initial

                                # Find file
                                annotation_file_found = None

                                # Expand user
                                annotation_file = full_path(annotation_file)

                                if os.path.exists(annotation_file):
                                    annotation_file_found = annotation_file

                                else:
                                    # Find within assembly folders
                                    for annotations_database in annotations_databases:
                                        found_files = find_all(
                                            annotation_file,
                                            os.path.join(
                                                annotations_database, assembly
                                            ),
                                        )
                                        if len(found_files) > 0:
                                            annotation_file_found = found_files[0]
                                            break
                                    if not annotation_file_found and not assembly:
                                        # Find within folders (no assembly subfolder)
                                        for (
                                            annotations_database
                                        ) in annotations_databases:
                                            found_files = find_all(
                                                annotation_file, annotations_database
                                            )
                                            if len(found_files) > 0:
                                                annotation_file_found = found_files[0]
                                                break
                                log.debug(
                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
                                )

                                # Full path
                                annotation_file_found = full_path(annotation_file_found)

                                if annotation_file_found:

                                    database = Database(database=annotation_file_found)
                                    quick_annotation_format = database.get_format()
                                    quick_annotation_is_compressed = (
                                        database.is_compressed()
                                    )
                                    quick_annotation_is_indexed = os.path.exists(
                                        f"{annotation_file_found}.tbi"
                                    )
                                    bcftools_preference = False

                                    # Check Annotation Tool: prefer bcftools
                                    # only for compressed+indexed vcf/bed
                                    # (and only when bcftools_preference is
                                    # enabled); otherwise use parquet
                                    if not annotation_tool:
                                        if (
                                            bcftools_preference
                                            and quick_annotation_format
                                            in ["vcf", "bed"]
                                            and quick_annotation_is_compressed
                                            and quick_annotation_is_indexed
                                        ):
                                            annotation_tool = "bcftools"
                                        elif quick_annotation_format in [
                                            "vcf",
                                            "bed",
                                            "tsv",
                                            "tsv",
                                            "csv",
                                            "json",
                                            "tbl",
                                            "parquet",
                                            "duckdb",
                                        ]:
                                            # NOTE(review): "tsv" appears twice
                                            # in this list — harmless duplicate
                                            annotation_tool = "parquet"
                                        else:
                                            log.error(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )
                                            raise ValueError(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )

                                    log.debug(
                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
                                    )

                                    # Annotation Tool dispatch: register the
                                    # database under its tool in param
                                    if annotation_tool:
                                        if annotation_tool not in param["annotation"]:
                                            param["annotation"][annotation_tool] = {}
                                        if (
                                            "annotations"
                                            not in param["annotation"][annotation_tool]
                                        ):
                                            param["annotation"][annotation_tool][
                                                "annotations"
                                            ] = {}
                                        param["annotation"][annotation_tool][
                                            "annotations"
                                        ][annotation_file_found] = annotations

                                else:
                                    log.error(
                                        f"Quick Annotation File {annotation_file} does NOT exist"
                                    )

                self.set_param(param)

        # Run each configured annotation tool
        if param.get("annotation", None):
            log.info("Annotations")
            if param.get("annotation", {}).get("parquet", None):
                log.info("Annotations 'parquet'...")
                self.annotation_parquet()
            if param.get("annotation", {}).get("bcftools", None):
                log.info("Annotations 'bcftools'...")
                self.annotation_bcftools()
            if param.get("annotation", {}).get("snpsift", None):
                log.info("Annotations 'snpsift'...")
                self.annotation_snpsift()
            if param.get("annotation", {}).get("annovar", None):
                log.info("Annotations 'annovar'...")
                self.annotation_annovar()
            if param.get("annotation", {}).get("snpeff", None):
                log.info("Annotations 'snpeff'...")
                self.annotation_snpeff()
            if param.get("annotation", {}).get("exomiser", None) is not None:
                log.info("Annotations 'exomiser'...")
                self.annotation_exomiser()
            if param.get("annotation", {}).get("splice", None) is not None:
                log.info("Annotations 'splice' ...")
                self.annotation_splice()

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

It annotates the VCF file with the annotations specified in the config file.

def annotation_snpsift(self, threads: int = None) -> None:
3089    def annotation_snpsift(self, threads: int = None) -> None:
3090        """
3091        This function annotate with bcftools
3092
3093        :param threads: Number of threads to use
3094        :return: the value of the variable "return_value".
3095        """
3096
3097        # DEBUG
3098        log.debug("Start annotation with bcftools databases")
3099
3100        # Threads
3101        if not threads:
3102            threads = self.get_threads()
3103        log.debug("Threads: " + str(threads))
3104
3105        # Config
3106        config = self.get_config()
3107        log.debug("Config: " + str(config))
3108
3109        # Config - snpSift
3110        snpsift_bin_command = get_bin_command(
3111            bin="SnpSift.jar",
3112            tool="snpsift",
3113            bin_type="jar",
3114            config=config,
3115            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
3116        )
3117        if not snpsift_bin_command:
3118            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
3119            log.error(msg_err)
3120            raise ValueError(msg_err)
3121
3122        # Config - bcftools
3123        bcftools_bin_command = get_bin_command(
3124            bin="bcftools",
3125            tool="bcftools",
3126            bin_type="bin",
3127            config=config,
3128            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
3129        )
3130        if not bcftools_bin_command:
3131            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
3132            log.error(msg_err)
3133            raise ValueError(msg_err)
3134
3135        # Config - BCFTools databases folders
3136        databases_folders = set(
3137            self.get_config()
3138            .get("folders", {})
3139            .get("databases", {})
3140            .get("annotations", ["."])
3141            + self.get_config()
3142            .get("folders", {})
3143            .get("databases", {})
3144            .get("bcftools", ["."])
3145        )
3146        log.debug("Databases annotations: " + str(databases_folders))
3147
3148        # Param
3149        annotations = (
3150            self.get_param()
3151            .get("annotation", {})
3152            .get("snpsift", {})
3153            .get("annotations", None)
3154        )
3155        log.debug("Annotations: " + str(annotations))
3156
3157        # Assembly
3158        assembly = self.get_param().get(
3159            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
3160        )
3161
3162        # Data
3163        table_variants = self.get_table_variants()
3164
3165        # Check if not empty
3166        log.debug("Check if not empty")
3167        sql_query_chromosomes = (
3168            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3169        )
3170        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
3171        if not sql_query_chromosomes_df["count"][0]:
3172            log.info(f"VCF empty")
3173            return
3174
3175        # VCF header
3176        vcf_reader = self.get_header()
3177        log.debug("Initial header: " + str(vcf_reader.infos))
3178
3179        # Existing annotations
3180        for vcf_annotation in self.get_header().infos:
3181
3182            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
3183            log.debug(
3184                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
3185            )
3186
3187        if annotations:
3188
3189            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
3190
3191                # Export VCF file
3192                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
3193
3194                # Init
3195                commands = {}
3196
3197                for annotation in annotations:
3198                    annotation_fields = annotations[annotation]
3199
3200                    # Annotation Name
3201                    annotation_name = os.path.basename(annotation)
3202
3203                    if not annotation_fields:
3204                        annotation_fields = {"INFO": None}
3205
3206                    log.debug(f"Annotation '{annotation_name}'")
3207                    log.debug(
3208                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
3209                    )
3210
3211                    # Create Database
3212                    database = Database(
3213                        database=annotation,
3214                        databases_folders=databases_folders,
3215                        assembly=assembly,
3216                    )
3217
3218                    # Find files
3219                    db_file = database.get_database()
3220                    db_file = full_path(db_file)
3221                    db_hdr_file = database.get_header_file()
3222                    db_hdr_file = full_path(db_hdr_file)
3223                    db_file_type = database.get_format()
3224                    db_tbi_file = f"{db_file}.tbi"
3225                    db_file_compressed = database.is_compressed()
3226
3227                    # Check if compressed
3228                    if not db_file_compressed:
3229                        log.error(
3230                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3231                        )
3232                        raise ValueError(
3233                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3234                        )
3235
3236                    # Check if indexed
3237                    if not os.path.exists(db_tbi_file):
3238                        log.error(
3239                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3240                        )
3241                        raise ValueError(
3242                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3243                        )
3244
3245                    # Check index - try to create if not exists
3246                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
3247                        log.error("Annotation failed: database not valid")
3248                        log.error(f"Annotation annotation file: {db_file}")
3249                        log.error(f"Annotation annotation header: {db_hdr_file}")
3250                        log.error(f"Annotation annotation index: {db_tbi_file}")
3251                        raise ValueError(
3252                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
3253                        )
3254                    else:
3255
3256                        log.debug(
3257                            f"Annotation '{annotation}' - file: "
3258                            + str(db_file)
3259                            + " and "
3260                            + str(db_hdr_file)
3261                        )
3262
3263                        # Load header as VCF object
3264                        db_hdr_vcf = Variants(input=db_hdr_file)
3265                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
3266                        log.debug(
3267                            "Annotation database header: "
3268                            + str(db_hdr_vcf_header_infos)
3269                        )
3270
3271                        # For all fields in database
3272                        annotation_fields_full = False
3273                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
3274                            annotation_fields = {
3275                                key: key for key in db_hdr_vcf_header_infos
3276                            }
3277                            log.debug(
3278                                "Annotation database header - All annotations added: "
3279                                + str(annotation_fields)
3280                            )
3281                            annotation_fields_full = True
3282
3283                        # # Create file for field rename
3284                        # log.debug("Create file for field rename")
3285                        # tmp_rename = NamedTemporaryFile(
3286                        #     prefix=self.get_prefix(),
3287                        #     dir=self.get_tmp_dir(),
3288                        #     suffix=".rename",
3289                        #     delete=False,
3290                        # )
3291                        # tmp_rename_name = tmp_rename.name
3292                        # tmp_files.append(tmp_rename_name)
3293
3294                        # Number of fields
3295                        nb_annotation_field = 0
3296                        annotation_list = []
3297                        annotation_infos_rename_list = []
3298
3299                        for annotation_field in annotation_fields:
3300
3301                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
3302                            annotation_fields_new_name = annotation_fields.get(
3303                                annotation_field, annotation_field
3304                            )
3305                            if not annotation_fields_new_name:
3306                                annotation_fields_new_name = annotation_field
3307
3308                            # Check if field is in DB and if field is not elready in input data
3309                            if (
3310                                annotation_field in db_hdr_vcf.get_header().infos
3311                                and annotation_fields_new_name
3312                                not in self.get_header().infos
3313                            ):
3314
3315                                log.info(
3316                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
3317                                )
3318
3319                                # BCFTools annotate param to rename fields
3320                                if annotation_field != annotation_fields_new_name:
3321                                    annotation_infos_rename_list.append(
3322                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
3323                                    )
3324
3325                                # Add INFO field to header
3326                                db_hdr_vcf_header_infos_number = (
3327                                    db_hdr_vcf_header_infos[annotation_field].num or "."
3328                                )
3329                                db_hdr_vcf_header_infos_type = (
3330                                    db_hdr_vcf_header_infos[annotation_field].type
3331                                    or "String"
3332                                )
3333                                db_hdr_vcf_header_infos_description = (
3334                                    db_hdr_vcf_header_infos[annotation_field].desc
3335                                    or f"{annotation_field} description"
3336                                )
3337                                db_hdr_vcf_header_infos_source = (
3338                                    db_hdr_vcf_header_infos[annotation_field].source
3339                                    or "unknown"
3340                                )
3341                                db_hdr_vcf_header_infos_version = (
3342                                    db_hdr_vcf_header_infos[annotation_field].version
3343                                    or "unknown"
3344                                )
3345
3346                                vcf_reader.infos[annotation_fields_new_name] = (
3347                                    vcf.parser._Info(
3348                                        annotation_fields_new_name,
3349                                        db_hdr_vcf_header_infos_number,
3350                                        db_hdr_vcf_header_infos_type,
3351                                        db_hdr_vcf_header_infos_description,
3352                                        db_hdr_vcf_header_infos_source,
3353                                        db_hdr_vcf_header_infos_version,
3354                                        self.code_type_map[
3355                                            db_hdr_vcf_header_infos_type
3356                                        ],
3357                                    )
3358                                )
3359
3360                                annotation_list.append(annotation_field)
3361
3362                                nb_annotation_field += 1
3363
3364                            else:
3365
3366                                if (
3367                                    annotation_field
3368                                    not in db_hdr_vcf.get_header().infos
3369                                ):
3370                                    log.warning(
3371                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
3372                                    )
3373                                if (
3374                                    annotation_fields_new_name
3375                                    in self.get_header().infos
3376                                ):
3377                                    log.warning(
3378                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
3379                                    )
3380
3381                        log.info(
3382                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
3383                        )
3384
3385                        annotation_infos = ",".join(annotation_list)
3386
3387                        if annotation_infos != "":
3388
3389                            # Annotated VCF (and error file)
3390                            tmp_annotation_vcf_name = os.path.join(
3391                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
3392                            )
3393                            tmp_annotation_vcf_name_err = (
3394                                tmp_annotation_vcf_name + ".err"
3395                            )
3396
3397                            # Add fields to annotate
3398                            if not annotation_fields_full:
3399                                annotation_infos_option = f"-info {annotation_infos}"
3400                            else:
3401                                annotation_infos_option = ""
3402
3403                            # Info fields rename
3404                            if annotation_infos_rename_list:
3405                                annotation_infos_rename = " -c " + ",".join(
3406                                    annotation_infos_rename_list
3407                                )
3408                            else:
3409                                annotation_infos_rename = ""
3410
3411                            # Annotate command
3412                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
3413
3414                            # Add command
3415                            commands[command_annotate] = tmp_annotation_vcf_name
3416
3417                if commands:
3418
3419                    # Export VCF file
3420                    self.export_variant_vcf(
3421                        vcf_file=tmp_vcf_name,
3422                        remove_info=True,
3423                        add_samples=False,
3424                        index=True,
3425                    )
3426                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
3427
3428                    # Num command
3429                    nb_command = 0
3430
3431                    # Annotate
3432                    for command_annotate in commands:
3433                        nb_command += 1
3434                        log.info(
3435                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
3436                        )
3437                        log.debug(f"command_annotate={command_annotate}")
3438                        run_parallel_commands([command_annotate], threads)
3439
3440                        # Debug
3441                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
3442
3443                        # Update variants
3444                        log.info(
3445                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
3446                        )
3447                        self.update_from_vcf(commands[command_annotate])

This function annotates the variants with bcftools.

Parameters
  • threads: Number of threads to use
Returns

None.

def annotation_bcftools(self, threads: int = None) -> None:
3449    def annotation_bcftools(self, threads: int = None) -> None:
3450        """
3451        This function annotate with bcftools
3452
3453        :param threads: Number of threads to use
3454        :return: the value of the variable "return_value".
3455        """
3456
3457        # DEBUG
3458        log.debug("Start annotation with bcftools databases")
3459
3460        # Threads
3461        if not threads:
3462            threads = self.get_threads()
3463        log.debug("Threads: " + str(threads))
3464
3465        # Config
3466        config = self.get_config()
3467        log.debug("Config: " + str(config))
3468
3469        # DEBUG
3470        delete_tmp = True
3471        if self.get_config().get("verbosity", "warning") in ["debug"]:
3472            delete_tmp = False
3473            log.debug("Delete tmp files/folders: " + str(delete_tmp))
3474
3475        # Config - BCFTools bin command
3476        bcftools_bin_command = get_bin_command(
3477            bin="bcftools",
3478            tool="bcftools",
3479            bin_type="bin",
3480            config=config,
3481            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
3482        )
3483        if not bcftools_bin_command:
3484            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
3485            log.error(msg_err)
3486            raise ValueError(msg_err)
3487
3488        # Config - BCFTools databases folders
3489        databases_folders = set(
3490            self.get_config()
3491            .get("folders", {})
3492            .get("databases", {})
3493            .get("annotations", ["."])
3494            + self.get_config()
3495            .get("folders", {})
3496            .get("databases", {})
3497            .get("bcftools", ["."])
3498        )
3499        log.debug("Databases annotations: " + str(databases_folders))
3500
3501        # Param
3502        annotations = (
3503            self.get_param()
3504            .get("annotation", {})
3505            .get("bcftools", {})
3506            .get("annotations", None)
3507        )
3508        log.debug("Annotations: " + str(annotations))
3509
3510        # Assembly
3511        assembly = self.get_param().get(
3512            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
3513        )
3514
3515        # Data
3516        table_variants = self.get_table_variants()
3517
3518        # Check if not empty
3519        log.debug("Check if not empty")
3520        sql_query_chromosomes = (
3521            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3522        )
3523        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
3524        if not sql_query_chromosomes_df["count"][0]:
3525            log.info(f"VCF empty")
3526            return
3527
3528        # Export in VCF
3529        log.debug("Create initial file to annotate")
3530        tmp_vcf = NamedTemporaryFile(
3531            prefix=self.get_prefix(),
3532            dir=self.get_tmp_dir(),
3533            suffix=".vcf.gz",
3534            delete=False,
3535        )
3536        tmp_vcf_name = tmp_vcf.name
3537
3538        # VCF header
3539        vcf_reader = self.get_header()
3540        log.debug("Initial header: " + str(vcf_reader.infos))
3541
3542        # Existing annotations
3543        for vcf_annotation in self.get_header().infos:
3544
3545            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
3546            log.debug(
3547                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
3548            )
3549
3550        if annotations:
3551
3552            tmp_ann_vcf_list = []
3553            commands = []
3554            tmp_files = []
3555            err_files = []
3556
3557            for annotation in annotations:
3558                annotation_fields = annotations[annotation]
3559
3560                # Annotation Name
3561                annotation_name = os.path.basename(annotation)
3562
3563                if not annotation_fields:
3564                    annotation_fields = {"INFO": None}
3565
3566                log.debug(f"Annotation '{annotation_name}'")
3567                log.debug(
3568                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
3569                )
3570
3571                # Create Database
3572                database = Database(
3573                    database=annotation,
3574                    databases_folders=databases_folders,
3575                    assembly=assembly,
3576                )
3577
3578                # Find files
3579                db_file = database.get_database()
3580                db_file = full_path(db_file)
3581                db_hdr_file = database.get_header_file()
3582                db_hdr_file = full_path(db_hdr_file)
3583                db_file_type = database.get_format()
3584                db_tbi_file = f"{db_file}.tbi"
3585                db_file_compressed = database.is_compressed()
3586
3587                # Check if compressed
3588                if not db_file_compressed:
3589                    log.error(
3590                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
3591                    )
3592                    raise ValueError(
3593                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
3594                    )
3595
3596                # Check if indexed
3597                if not os.path.exists(db_tbi_file):
3598                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
3599                    raise ValueError(
3600                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
3601                    )
3602
3603                # Check index - try to create if not exists
3604                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
3605                    log.error("Annotation failed: database not valid")
3606                    log.error(f"Annotation annotation file: {db_file}")
3607                    log.error(f"Annotation annotation header: {db_hdr_file}")
3608                    log.error(f"Annotation annotation index: {db_tbi_file}")
3609                    raise ValueError(
3610                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
3611                    )
3612                else:
3613
3614                    log.debug(
3615                        f"Annotation '{annotation}' - file: "
3616                        + str(db_file)
3617                        + " and "
3618                        + str(db_hdr_file)
3619                    )
3620
3621                    # Load header as VCF object
3622                    db_hdr_vcf = Variants(input=db_hdr_file)
3623                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
3624                    log.debug(
3625                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
3626                    )
3627
3628                    # For all fields in database
3629                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
3630                        annotation_fields = {
3631                            key: key for key in db_hdr_vcf_header_infos
3632                        }
3633                        log.debug(
3634                            "Annotation database header - All annotations added: "
3635                            + str(annotation_fields)
3636                        )
3637
3638                    # Number of fields
3639                    nb_annotation_field = 0
3640                    annotation_list = []
3641
3642                    for annotation_field in annotation_fields:
3643
3644                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
3645                        annotation_fields_new_name = annotation_fields.get(
3646                            annotation_field, annotation_field
3647                        )
3648                        if not annotation_fields_new_name:
3649                            annotation_fields_new_name = annotation_field
3650
3651                        # Check if field is in DB and if field is not elready in input data
3652                        if (
3653                            annotation_field in db_hdr_vcf.get_header().infos
3654                            and annotation_fields_new_name
3655                            not in self.get_header().infos
3656                        ):
3657
3658                            log.info(
3659                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
3660                            )
3661
3662                            # Add INFO field to header
3663                            db_hdr_vcf_header_infos_number = (
3664                                db_hdr_vcf_header_infos[annotation_field].num or "."
3665                            )
3666                            db_hdr_vcf_header_infos_type = (
3667                                db_hdr_vcf_header_infos[annotation_field].type
3668                                or "String"
3669                            )
3670                            db_hdr_vcf_header_infos_description = (
3671                                db_hdr_vcf_header_infos[annotation_field].desc
3672                                or f"{annotation_field} description"
3673                            )
3674                            db_hdr_vcf_header_infos_source = (
3675                                db_hdr_vcf_header_infos[annotation_field].source
3676                                or "unknown"
3677                            )
3678                            db_hdr_vcf_header_infos_version = (
3679                                db_hdr_vcf_header_infos[annotation_field].version
3680                                or "unknown"
3681                            )
3682
3683                            vcf_reader.infos[annotation_fields_new_name] = (
3684                                vcf.parser._Info(
3685                                    annotation_fields_new_name,
3686                                    db_hdr_vcf_header_infos_number,
3687                                    db_hdr_vcf_header_infos_type,
3688                                    db_hdr_vcf_header_infos_description,
3689                                    db_hdr_vcf_header_infos_source,
3690                                    db_hdr_vcf_header_infos_version,
3691                                    self.code_type_map[db_hdr_vcf_header_infos_type],
3692                                )
3693                            )
3694
3695                            # annotation_list.append(annotation_field)
3696                            if annotation_field != annotation_fields_new_name:
3697                                annotation_list.append(
3698                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
3699                                )
3700                            else:
3701                                annotation_list.append(annotation_field)
3702
3703                            nb_annotation_field += 1
3704
3705                        else:
3706
3707                            if annotation_field not in db_hdr_vcf.get_header().infos:
3708                                log.warning(
3709                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
3710                                )
3711                            if annotation_fields_new_name in self.get_header().infos:
3712                                log.warning(
3713                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
3714                                )
3715
3716                    log.info(
3717                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
3718                    )
3719
3720                    annotation_infos = ",".join(annotation_list)
3721
3722                    if annotation_infos != "":
3723
3724                        # Protect header for bcftools (remove "#CHROM" and variants line)
3725                        log.debug("Protect Header file - remove #CHROM line if exists")
3726                        tmp_header_vcf = NamedTemporaryFile(
3727                            prefix=self.get_prefix(),
3728                            dir=self.get_tmp_dir(),
3729                            suffix=".hdr",
3730                            delete=False,
3731                        )
3732                        tmp_header_vcf_name = tmp_header_vcf.name
3733                        tmp_files.append(tmp_header_vcf_name)
3734                        # Command
3735                        if db_hdr_file.endswith(".gz"):
3736                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
3737                        else:
3738                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
3739                        # Run
3740                        run_parallel_commands([command_extract_header], 1)
3741
3742                        # Find chomosomes
3743                        log.debug("Find chromosomes ")
3744                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
3745                        sql_query_chromosomes_df = self.get_query_to_df(
3746                            sql_query_chromosomes
3747                        )
3748                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])
3749
3750                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))
3751
3752                        # BED columns in the annotation file
3753                        if db_file_type in ["bed"]:
3754                            annotation_infos = "CHROM,POS,POS," + annotation_infos
3755
3756                        for chrom in chomosomes_list:
3757
3758                            # Create BED on initial VCF
3759                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
3760                            tmp_bed = NamedTemporaryFile(
3761                                prefix=self.get_prefix(),
3762                                dir=self.get_tmp_dir(),
3763                                suffix=".bed",
3764                                delete=False,
3765                            )
3766                            tmp_bed_name = tmp_bed.name
3767                            tmp_files.append(tmp_bed_name)
3768
3769                            # Detecte regions
3770                            log.debug(
3771                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
3772                            )
3773                            window = 1000000
3774                            sql_query_intervals_for_bed = f"""
3775                                SELECT  \"#CHROM\",
3776                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
3777                                        \"POS\"+{window}
3778                                FROM {table_variants} as table_variants
3779                                WHERE table_variants.\"#CHROM\" = '{chrom}'
3780                            """
3781                            regions = self.conn.execute(
3782                                sql_query_intervals_for_bed
3783                            ).fetchall()
3784                            merged_regions = merge_regions(regions)
3785                            log.debug(
3786                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
3787                            )
3788
3789                            header = ["#CHROM", "START", "END"]
3790                            with open(tmp_bed_name, "w") as f:
3791                                # Write the header with tab delimiter
3792                                f.write("\t".join(header) + "\n")
3793                                for d in merged_regions:
3794                                    # Write each data row with tab delimiter
3795                                    f.write("\t".join(map(str, d)) + "\n")
3796
3797                            # Tmp files
3798                            tmp_annotation_vcf = NamedTemporaryFile(
3799                                prefix=self.get_prefix(),
3800                                dir=self.get_tmp_dir(),
3801                                suffix=".vcf.gz",
3802                                delete=False,
3803                            )
3804                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
3805                            tmp_files.append(tmp_annotation_vcf_name)
3806                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
3807                            tmp_annotation_vcf_name_err = (
3808                                tmp_annotation_vcf_name + ".err"
3809                            )
3810                            err_files.append(tmp_annotation_vcf_name_err)
3811
3812                            # Annotate Command
3813                            log.debug(
3814                                f"Annotation '{annotation}' - add bcftools command"
3815                            )
3816
3817                            # Command
3818                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
3819
3820                            # Add command
3821                            commands.append(command_annotate)
3822
3823            # if some commands
3824            if commands:
3825
3826                # Export VCF file
3827                self.export_variant_vcf(
3828                    vcf_file=tmp_vcf_name,
3829                    remove_info=True,
3830                    add_samples=False,
3831                    index=True,
3832                )
3833
3834                # Threads
3835                # calculate threads for annotated commands
3836                if commands:
3837                    threads_bcftools_annotate = round(threads / len(commands))
3838                else:
3839                    threads_bcftools_annotate = 1
3840
3841                if not threads_bcftools_annotate:
3842                    threads_bcftools_annotate = 1
3843
3844                # Add threads option to bcftools commands
3845                if threads_bcftools_annotate > 1:
3846                    commands_threaded = []
3847                    for command in commands:
3848                        commands_threaded.append(
3849                            command.replace(
3850                                f"{bcftools_bin_command} annotate ",
3851                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
3852                            )
3853                        )
3854                    commands = commands_threaded
3855
3856                # Command annotation multithreading
3857                log.debug(f"Annotation - Annotation commands: " + str(commands))
3858                log.info(
3859                    f"Annotation - Annotation multithreaded in "
3860                    + str(len(commands))
3861                    + " commands"
3862                )
3863
3864                run_parallel_commands(commands, threads)
3865
3866                # Merge
3867                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)
3868
3869                if tmp_ann_vcf_list_cmd:
3870
3871                    # Tmp file
3872                    tmp_annotate_vcf = NamedTemporaryFile(
3873                        prefix=self.get_prefix(),
3874                        dir=self.get_tmp_dir(),
3875                        suffix=".vcf.gz",
3876                        delete=True,
3877                    )
3878                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
3879                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
3880                    err_files.append(tmp_annotate_vcf_name_err)
3881
3882                    # Tmp file remove command
3883                    tmp_files_remove_command = ""
3884                    if tmp_files:
3885                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)
3886
3887                    # Command merge
3888                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
3889                    log.info(
3890                        f"Annotation - Annotation merging "
3891                        + str(len(commands))
3892                        + " annotated files"
3893                    )
3894                    log.debug(f"Annotation - merge command: {merge_command}")
3895                    run_parallel_commands([merge_command], 1)
3896
3897                    # Error messages
3898                    log.info(f"Error/Warning messages:")
3899                    error_message_command_all = []
3900                    error_message_command_warning = []
3901                    error_message_command_err = []
3902                    for err_file in err_files:
3903                        with open(err_file, "r") as f:
3904                            for line in f:
3905                                message = line.strip()
3906                                error_message_command_all.append(message)
3907                                if line.startswith("[W::"):
3908                                    error_message_command_warning.append(message)
3909                                if line.startswith("[E::"):
3910                                    error_message_command_err.append(
3911                                        f"{err_file}: " + message
3912                                    )
3913                    # log info
3914                    for message in list(
3915                        set(error_message_command_err + error_message_command_warning)
3916                    ):
3917                        log.info(f"   {message}")
3918                    # debug info
3919                    for message in list(set(error_message_command_all)):
3920                        log.debug(f"   {message}")
3921                    # failed
3922                    if len(error_message_command_err):
3923                        log.error("Annotation failed: Error in commands")
3924                        raise ValueError("Annotation failed: Error in commands")
3925
3926                    # Update variants
3927                    log.info(f"Annotation - Updating...")
3928                    self.update_from_vcf(tmp_annotate_vcf_name)

    # NOTE(review): the text below is extraction residue — a duplicated,
    # mis-rendered docstring of the preceding bcftools annotation method and a
    # stray copy of the "annotation_exomiser" signature defined immediately
    # below. Reconstructed as comments so the file stays syntactically valid.
    #
    # This function annotates with bcftools.
    #
    # Parameters:
    #   threads: number of threads to use
    # Returns:
    #   the value of the variable "return_value".
3930    def annotation_exomiser(self, threads: int = None) -> None:
3931        """
3932        This function annotate with Exomiser
3933
3934        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
3935        - "analysis" (dict/file):
3936            Full analysis dictionnary parameters (see Exomiser docs).
3937            Either a dict, or a file in JSON or YAML format.
3938            These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO)
3939            Default : None
3940        - "preset" (string):
3941            Analysis preset (available in config folder).
3942            Used if no full "analysis" is provided.
3943            Default: "exome"
3944        - "phenopacket" (dict/file):
3945            Samples and phenotipic features parameters (see Exomiser docs).
3946            Either a dict, or a file in JSON or YAML format.
3947            Default: None
3948        - "subject" (dict):
3949            Sample parameters (see Exomiser docs).
3950            Example:
3951                "subject":
3952                    {
3953                        "id": "ISDBM322017",
3954                        "sex": "FEMALE"
3955                    }
3956            Default: None
3957        - "sample" (string):
3958            Sample name to construct "subject" section:
3959                "subject":
3960                    {
3961                        "id": "<sample>",
3962                        "sex": "UNKNOWN_SEX"
3963                    }
3964            Default: None
3965        - "phenotypicFeatures" (dict)
3966            Phenotypic features to construct "subject" section.
3967            Example:
3968                "phenotypicFeatures":
3969                    [
3970                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
3971                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
3972                    ]
3973        - "hpo" (list)
3974            List of HPO ids as phenotypic features.
3975            Example:
3976                "hpo": ['0001156', '0001363', '0011304', '0010055']
3977            Default: []
3978        - "outputOptions" (dict):
3979            Output options (see Exomiser docs).
3980            Default:
3981                "output_options" =
3982                    {
3983                        "outputContributingVariantsOnly": False,
3984                        "numGenes": 0,
3985                        "outputFormats": ["TSV_VARIANT", "VCF"]
3986                    }
3987        - "transcript_source" (string):
3988            Transcript source (either "refseq", "ucsc", "ensembl")
3989            Default: "refseq"
3990        - "exomiser_to_info" (boolean):
3991            Add exomiser TSV file columns as INFO fields in VCF.
3992            Default: False
3993        - "release" (string):
3994            Exomise database release.
3995            If not exists, database release will be downloaded (take a while).
3996            Default: None (provided by application.properties configuration file)
3997        - "exomiser_application_properties" (file):
3998            Exomiser configuration file (see Exomiser docs).
3999            Useful to automatically download databases (especially for specific genome databases).
4000
4001        Notes:
4002        - If no sample in parameters, first sample in VCF will be chosen
4003        - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off
4004
4005        :param threads: The number of threads to use
4006        :return: None.
4007        """
4008
4009        # DEBUG
4010        log.debug("Start annotation with Exomiser databases")
4011
4012        # Threads
4013        if not threads:
4014            threads = self.get_threads()
4015        log.debug("Threads: " + str(threads))
4016
4017        # Config
4018        config = self.get_config()
4019        log.debug("Config: " + str(config))
4020
4021        # Config - Folders - Databases
4022        databases_folders = (
4023            config.get("folders", {})
4024            .get("databases", {})
4025            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
4026        )
4027        databases_folders = full_path(databases_folders)
4028        if not os.path.exists(databases_folders):
4029            log.error(f"Databases annotations: {databases_folders} NOT found")
4030        log.debug("Databases annotations: " + str(databases_folders))
4031
4032        # Config - Exomiser
4033        exomiser_bin_command = get_bin_command(
4034            bin="exomiser-cli*.jar",
4035            tool="exomiser",
4036            bin_type="jar",
4037            config=config,
4038            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
4039        )
4040        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
4041        if not exomiser_bin_command:
4042            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
4043            log.error(msg_err)
4044            raise ValueError(msg_err)
4045
4046        # Param
4047        param = self.get_param()
4048        log.debug("Param: " + str(param))
4049
4050        # Param - Exomiser
4051        param_exomiser = param.get("annotation", {}).get("exomiser", {})
4052        log.debug(f"Param Exomiser: {param_exomiser}")
4053
4054        # Param - Assembly
4055        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
4056        log.debug("Assembly: " + str(assembly))
4057
4058        # Data
4059        table_variants = self.get_table_variants()
4060
4061        # Check if not empty
4062        log.debug("Check if not empty")
4063        sql_query_chromosomes = (
4064            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
4065        )
4066        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
4067            log.info(f"VCF empty")
4068            return False
4069
4070        # VCF header
4071        vcf_reader = self.get_header()
4072        log.debug("Initial header: " + str(vcf_reader.infos))
4073
4074        # Samples
4075        samples = self.get_header_sample_list()
4076        if not samples:
4077            log.error("No Samples in VCF")
4078            return False
4079        log.debug(f"Samples: {samples}")
4080
4081        # Memory limit
4082        memory_limit = self.get_memory("8G")
4083        log.debug(f"memory_limit: {memory_limit}")
4084
4085        # Exomiser java options
4086        exomiser_java_options = (
4087            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
4088        )
4089        log.debug(f"Exomiser java options: {exomiser_java_options}")
4090
4091        # Download Exomiser (if not exists)
4092        exomiser_release = param_exomiser.get("release", None)
4093        exomiser_application_properties = param_exomiser.get(
4094            "exomiser_application_properties", None
4095        )
4096        databases_download_exomiser(
4097            assemblies=[assembly],
4098            exomiser_folder=databases_folders,
4099            exomiser_release=exomiser_release,
4100            exomiser_phenotype_release=exomiser_release,
4101            exomiser_application_properties=exomiser_application_properties,
4102        )
4103
4104        # Force annotation
4105        force_update_annotation = True
4106
4107        if "Exomiser" not in self.get_header().infos or force_update_annotation:
4108            log.debug("Start annotation Exomiser")
4109
4110            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
4111
4112                # tmp_dir = "/tmp/exomiser"
4113
4114                ### ANALYSIS ###
4115                ################
4116
4117                # Create analysis.json through analysis dict
4118                # either analysis in param or by default
4119                # depending on preset exome/genome)
4120
4121                # Init analysis dict
4122                param_exomiser_analysis_dict = {}
4123
4124                # analysis from param
4125                param_exomiser_analysis = param_exomiser.get("analysis", {})
4126                param_exomiser_analysis = full_path(param_exomiser_analysis)
4127
4128                # If analysis in param -> load anlaysis json
4129                if param_exomiser_analysis:
4130
4131                    # If param analysis is a file and exists
4132                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
4133                        param_exomiser_analysis
4134                    ):
4135                        # Load analysis file into analysis dict (either yaml or json)
4136                        with open(param_exomiser_analysis) as json_file:
4137                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
4138
4139                    # If param analysis is a dict
4140                    elif isinstance(param_exomiser_analysis, dict):
4141                        # Load analysis dict into analysis dict (either yaml or json)
4142                        param_exomiser_analysis_dict = param_exomiser_analysis
4143
4144                    # Error analysis type
4145                    else:
4146                        log.error(f"Analysis type unknown. Check param file.")
4147                        raise ValueError(f"Analysis type unknown. Check param file.")
4148
4149                # Case no input analysis config file/dict
4150                # Use preset (exome/genome) to open default config file
4151                if not param_exomiser_analysis_dict:
4152
4153                    # default preset
4154                    default_preset = "exome"
4155
4156                    # Get param preset or default preset
4157                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
4158
4159                    # Try to find if preset is a file
4160                    if os.path.exists(param_exomiser_preset):
4161                        # Preset file is provided in full path
4162                        param_exomiser_analysis_default_config_file = (
4163                            param_exomiser_preset
4164                        )
4165                    # elif os.path.exists(full_path(param_exomiser_preset)):
4166                    #     # Preset file is provided in full path
4167                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
4168                    elif os.path.exists(
4169                        os.path.join(folder_config, param_exomiser_preset)
4170                    ):
4171                        # Preset file is provided a basename in config folder (can be a path with subfolders)
4172                        param_exomiser_analysis_default_config_file = os.path.join(
4173                            folder_config, param_exomiser_preset
4174                        )
4175                    else:
4176                        # Construct preset file
4177                        param_exomiser_analysis_default_config_file = os.path.join(
4178                            folder_config,
4179                            f"preset-{param_exomiser_preset}-analysis.json",
4180                        )
4181
4182                    # If preset file exists
4183                    param_exomiser_analysis_default_config_file = full_path(
4184                        param_exomiser_analysis_default_config_file
4185                    )
4186                    if os.path.exists(param_exomiser_analysis_default_config_file):
4187                        # Load prest file into analysis dict (either yaml or json)
4188                        with open(
4189                            param_exomiser_analysis_default_config_file
4190                        ) as json_file:
4191                            # param_exomiser_analysis_dict[""] = json.load(json_file)
4192                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
4193                                json_file
4194                            )
4195
4196                    # Error preset file
4197                    else:
4198                        log.error(
4199                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4200                        )
4201                        raise ValueError(
4202                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4203                        )
4204
4205                # If no analysis dict created
4206                if not param_exomiser_analysis_dict:
4207                    log.error(f"No analysis config")
4208                    raise ValueError(f"No analysis config")
4209
4210                # Log
4211                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4212
4213                ### PHENOPACKET ###
4214                ###################
4215
4216                # If no PhenoPacket in analysis dict -> check in param
4217                if "phenopacket" not in param_exomiser_analysis_dict:
4218
4219                    # If PhenoPacket in param -> load anlaysis json
4220                    if param_exomiser.get("phenopacket", None):
4221
4222                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
4223                        param_exomiser_phenopacket = full_path(
4224                            param_exomiser_phenopacket
4225                        )
4226
4227                        # If param phenopacket is a file and exists
4228                        if isinstance(
4229                            param_exomiser_phenopacket, str
4230                        ) and os.path.exists(param_exomiser_phenopacket):
4231                            # Load phenopacket file into analysis dict (either yaml or json)
4232                            with open(param_exomiser_phenopacket) as json_file:
4233                                param_exomiser_analysis_dict["phenopacket"] = (
4234                                    yaml.safe_load(json_file)
4235                                )
4236
4237                        # If param phenopacket is a dict
4238                        elif isinstance(param_exomiser_phenopacket, dict):
4239                            # Load phenopacket dict into analysis dict (either yaml or json)
4240                            param_exomiser_analysis_dict["phenopacket"] = (
4241                                param_exomiser_phenopacket
4242                            )
4243
4244                        # Error phenopacket type
4245                        else:
4246                            log.error(f"Phenopacket type unknown. Check param file.")
4247                            raise ValueError(
4248                                f"Phenopacket type unknown. Check param file."
4249                            )
4250
4251                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
4252                if "phenopacket" not in param_exomiser_analysis_dict:
4253
4254                    # Init PhenoPacket
4255                    param_exomiser_analysis_dict["phenopacket"] = {
4256                        "id": "analysis",
4257                        "proband": {},
4258                    }
4259
4260                    ### Add subject ###
4261
4262                    # If subject exists
4263                    param_exomiser_subject = param_exomiser.get("subject", {})
4264
4265                    # If subject not exists -> found sample ID
4266                    if not param_exomiser_subject:
4267
4268                        # Found sample ID in param
4269                        sample = param_exomiser.get("sample", None)
4270
4271                        # Find sample ID (first sample)
4272                        if not sample:
4273                            sample_list = self.get_header_sample_list()
4274                            if len(sample_list) > 0:
4275                                sample = sample_list[0]
4276                            else:
4277                                log.error(f"No sample found")
4278                                raise ValueError(f"No sample found")
4279
4280                        # Create subject
4281                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
4282
4283                    # Add to dict
4284                    param_exomiser_analysis_dict["phenopacket"][
4285                        "subject"
4286                    ] = param_exomiser_subject
4287
4288                    ### Add "phenotypicFeatures" ###
4289
4290                    # If phenotypicFeatures exists
4291                    param_exomiser_phenotypicfeatures = param_exomiser.get(
4292                        "phenotypicFeatures", []
4293                    )
4294
4295                    # If phenotypicFeatures not exists -> Try to infer from hpo list
4296                    if not param_exomiser_phenotypicfeatures:
4297
4298                        # Found HPO in param
4299                        param_exomiser_hpo = param_exomiser.get("hpo", [])
4300
4301                        # Split HPO if list in string format separated by comma
4302                        if isinstance(param_exomiser_hpo, str):
4303                            param_exomiser_hpo = param_exomiser_hpo.split(",")
4304
4305                        # Create HPO list
4306                        for hpo in param_exomiser_hpo:
4307                            hpo_clean = re.sub("[^0-9]", "", hpo)
4308                            param_exomiser_phenotypicfeatures.append(
4309                                {
4310                                    "type": {
4311                                        "id": f"HP:{hpo_clean}",
4312                                        "label": f"HP:{hpo_clean}",
4313                                    }
4314                                }
4315                            )
4316
4317                    # Add to dict
4318                    param_exomiser_analysis_dict["phenopacket"][
4319                        "phenotypicFeatures"
4320                    ] = param_exomiser_phenotypicfeatures
4321
4322                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
4323                    if not param_exomiser_phenotypicfeatures:
4324                        for step in param_exomiser_analysis_dict.get(
4325                            "analysis", {}
4326                        ).get("steps", []):
4327                            if "hiPhivePrioritiser" in step:
4328                                param_exomiser_analysis_dict.get("analysis", {}).get(
4329                                    "steps", []
4330                                ).remove(step)
4331
4332                ### Add Input File ###
4333
4334                # Initial file name and htsFiles
4335                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
4336                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
4337                    {
4338                        "uri": tmp_vcf_name,
4339                        "htsFormat": "VCF",
4340                        "genomeAssembly": assembly,
4341                    }
4342                ]
4343
4344                ### Add metaData ###
4345
4346                # If metaData not in analysis dict
4347                if "metaData" not in param_exomiser_analysis_dict:
4348                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
4349                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
4350                        "createdBy": "howard",
4351                        "phenopacketSchemaVersion": 1,
4352                    }
4353
4354                ### OutputOptions ###
4355
4356                # Init output result folder
4357                output_results = os.path.join(tmp_dir, "results")
4358
4359                # If no outputOptions in analysis dict
4360                if "outputOptions" not in param_exomiser_analysis_dict:
4361
4362                    # default output formats
4363                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
4364
4365                    # Get outputOptions in param
4366                    output_options = param_exomiser.get("outputOptions", None)
4367
4368                    # If no output_options in param -> check
4369                    if not output_options:
4370                        output_options = {
4371                            "outputContributingVariantsOnly": False,
4372                            "numGenes": 0,
4373                            "outputFormats": defaut_output_formats,
4374                        }
4375
4376                    # Replace outputDirectory in output options
4377                    output_options["outputDirectory"] = output_results
4378                    output_options["outputFileName"] = "howard"
4379
4380                    # Add outputOptions in analysis dict
4381                    param_exomiser_analysis_dict["outputOptions"] = output_options
4382
4383                else:
4384
4385                    # Replace output_results and output format (if exists in param)
4386                    param_exomiser_analysis_dict["outputOptions"][
4387                        "outputDirectory"
4388                    ] = output_results
4389                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
4390                        list(
4391                            set(
4392                                param_exomiser_analysis_dict.get(
4393                                    "outputOptions", {}
4394                                ).get("outputFormats", [])
4395                                + ["TSV_VARIANT", "VCF"]
4396                            )
4397                        )
4398                    )
4399
4400                # log
4401                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4402
4403                ### ANALYSIS FILE ###
4404                #####################
4405
4406                ### Full JSON analysis config file ###
4407
4408                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
4409                with open(exomiser_analysis, "w") as fp:
4410                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
4411
4412                ### SPLIT analysis and sample config files
4413
4414                # Splitted analysis dict
4415                param_exomiser_analysis_dict_for_split = (
4416                    param_exomiser_analysis_dict.copy()
4417                )
4418
4419                # Phenopacket JSON file
4420                exomiser_analysis_phenopacket = os.path.join(
4421                    tmp_dir, "analysis_phenopacket.json"
4422                )
4423                with open(exomiser_analysis_phenopacket, "w") as fp:
4424                    json.dump(
4425                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
4426                        fp,
4427                        indent=4,
4428                    )
4429
4430                # Analysis JSON file without Phenopacket parameters
4431                param_exomiser_analysis_dict_for_split.pop("phenopacket")
4432                exomiser_analysis_analysis = os.path.join(
4433                    tmp_dir, "analysis_analysis.json"
4434                )
4435                with open(exomiser_analysis_analysis, "w") as fp:
4436                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
4437
4438                ### INITAL VCF file ###
4439                #######################
4440
4441                ### Create list of samples to use and include inti initial VCF file ####
4442
4443                # Subject (main sample)
4444                # Get sample ID in analysis dict
4445                sample_subject = (
4446                    param_exomiser_analysis_dict.get("phenopacket", {})
4447                    .get("subject", {})
4448                    .get("id", None)
4449                )
4450                sample_proband = (
4451                    param_exomiser_analysis_dict.get("phenopacket", {})
4452                    .get("proband", {})
4453                    .get("subject", {})
4454                    .get("id", None)
4455                )
4456                sample = []
4457                if sample_subject:
4458                    sample.append(sample_subject)
4459                if sample_proband:
4460                    sample.append(sample_proband)
4461
4462                # Get sample ID within Pedigree
4463                pedigree_persons_list = (
4464                    param_exomiser_analysis_dict.get("phenopacket", {})
4465                    .get("pedigree", {})
4466                    .get("persons", {})
4467                )
4468
4469                # Create list with all sample ID in pedigree (if exists)
4470                pedigree_persons = []
4471                for person in pedigree_persons_list:
4472                    pedigree_persons.append(person.get("individualId"))
4473
4474                # Concat subject sample ID and samples ID in pedigreesamples
4475                samples = list(set(sample + pedigree_persons))
4476
4477                # Check if sample list is not empty
4478                if not samples:
4479                    log.error(f"No samples found")
4480                    raise ValueError(f"No samples found")
4481
4482                # Create VCF with sample (either sample in param or first one by default)
4483                # Export VCF file
4484                self.export_variant_vcf(
4485                    vcf_file=tmp_vcf_name,
4486                    remove_info=True,
4487                    add_samples=True,
4488                    list_samples=samples,
4489                    index=False,
4490                )
4491
4492                ### Execute Exomiser ###
4493                ########################
4494
4495                # Init command
4496                exomiser_command = ""
4497
4498                # Command exomiser options
4499                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
4500
4501                # Release
4502                exomiser_release = param_exomiser.get("release", None)
4503                if exomiser_release:
4504                    # phenotype data version
4505                    exomiser_options += (
4506                        f" --exomiser.phenotype.data-version={exomiser_release} "
4507                    )
4508                    # data version
4509                    exomiser_options += (
4510                        f" --exomiser.{assembly}.data-version={exomiser_release} "
4511                    )
4512                    # variant white list
4513                    variant_white_list_file = (
4514                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
4515                    )
4516                    if os.path.exists(
4517                        os.path.join(
4518                            databases_folders, assembly, variant_white_list_file
4519                        )
4520                    ):
4521                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
4522
4523                # transcript_source
4524                transcript_source = param_exomiser.get(
4525                    "transcript_source", None
4526                )  # ucsc, refseq, ensembl
4527                if transcript_source:
4528                    exomiser_options += (
4529                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
4530                    )
4531
4532                # If analysis contain proband param
4533                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
4534                    "proband", {}
4535                ):
4536                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
4537
4538                # If no proband (usually uniq sample)
4539                else:
4540                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
4541
4542                # Log
4543                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
4544
4545                # Run command
4546                result = subprocess.call(
4547                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
4548                )
4549                if result:
4550                    log.error("Exomiser command failed")
4551                    raise ValueError("Exomiser command failed")
4552
4553                ### RESULTS ###
4554                ###############
4555
4556                ### Annotate with TSV fields ###
4557
4558                # Init result tsv file
4559                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
4560
4561                # Init result tsv file
4562                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
4563
4564                # Parse TSV file and explode columns in INFO field
4565                if exomiser_to_info and os.path.exists(output_results_tsv):
4566
4567                    # Log
4568                    log.debug("Exomiser columns to VCF INFO field")
4569
4570                    # Retrieve columns and types
4571                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
4572                    output_results_tsv_df = self.get_query_to_df(query)
4573                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
4574
4575                    # Init concat fields for update
4576                    sql_query_update_concat_fields = []
4577
4578                    # Fields to avoid
4579                    fields_to_avoid = [
4580                        "CONTIG",
4581                        "START",
4582                        "END",
4583                        "REF",
4584                        "ALT",
4585                        "QUAL",
4586                        "FILTER",
4587                        "GENOTYPE",
4588                    ]
4589
4590                    # List all columns to add into header
4591                    for header_column in output_results_tsv_columns:
4592
4593                        # If header column is enable
4594                        if header_column not in fields_to_avoid:
4595
4596                            # Header info type
4597                            header_info_type = "String"
4598                            header_column_df = output_results_tsv_df[header_column]
4599                            header_column_df_dtype = header_column_df.dtype
4600                            if header_column_df_dtype == object:
4601                                if (
4602                                    pd.to_numeric(header_column_df, errors="coerce")
4603                                    .notnull()
4604                                    .all()
4605                                ):
4606                                    header_info_type = "Float"
4607                            else:
4608                                header_info_type = "Integer"
4609
4610                            # Header info
4611                            characters_to_validate = ["-"]
4612                            pattern = "[" + "".join(characters_to_validate) + "]"
4613                            header_info_name = re.sub(
4614                                pattern,
4615                                "_",
4616                                f"Exomiser_{header_column}".replace("#", ""),
4617                            )
4618                            header_info_number = "."
4619                            header_info_description = (
4620                                f"Exomiser {header_column} annotation"
4621                            )
4622                            header_info_source = "Exomiser"
4623                            header_info_version = "unknown"
4624                            header_info_code = CODE_TYPE_MAP[header_info_type]
4625                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
4626                                header_info_name,
4627                                header_info_number,
4628                                header_info_type,
4629                                header_info_description,
4630                                header_info_source,
4631                                header_info_version,
4632                                header_info_code,
4633                            )
4634
4635                            # Add field to add for update to concat fields
4636                            sql_query_update_concat_fields.append(
4637                                f"""
4638                                CASE
4639                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
4640                                    THEN concat(
4641                                        '{header_info_name}=',
4642                                        table_parquet."{header_column}",
4643                                        ';'
4644                                        )
4645
4646                                    ELSE ''
4647                                END
4648                            """
4649                            )
4650
4651                    # Update query
4652                    sql_query_update = f"""
4653                        UPDATE {table_variants} as table_variants
4654                            SET INFO = concat(
4655                                            CASE
4656                                                WHEN INFO NOT IN ('', '.')
4657                                                THEN INFO
4658                                                ELSE ''
4659                                            END,
4660                                            CASE
4661                                                WHEN table_variants.INFO NOT IN ('','.')
4662                                                THEN ';'
4663                                                ELSE ''
4664                                            END,
4665                                            (
4666                                            SELECT 
4667                                                concat(
4668                                                    {",".join(sql_query_update_concat_fields)}
4669                                                )
4670                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
4671                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
4672                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
4673                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
4674                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
4675                                            )
4676                                        )
4677                            ;
4678                        """
4679
4680                    # Update
4681                    self.conn.execute(sql_query_update)
4682
4683                ### Annotate with VCF INFO field ###
4684
4685                # Init result VCF file
4686                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
4687
4688                # If VCF exists
4689                if os.path.exists(output_results_vcf):
4690
4691                    # Log
4692                    log.debug("Exomiser result VCF update variants")
4693
4694                    # Find Exomiser INFO field annotation in header
4695                    with gzip.open(output_results_vcf, "rt") as f:
4696                        header_list = self.read_vcf_header(f)
4697                    exomiser_vcf_header = vcf.Reader(
4698                        io.StringIO("\n".join(header_list))
4699                    )
4700
4701                    # Add annotation INFO field to header
4702                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
4703
4704                    # Update variants with VCF
4705                    self.update_from_vcf(output_results_vcf)
4706
4707        return True

This function annotates with Exomiser

This function uses args as parameters, in section "annotation" -> "exomiser", with sections:

  • "analysis" (dict/file): Full analysis dictionary parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO) Default: None
  • "preset" (string): Analysis preset (available in config folder). Used if no full "analysis" is provided. Default: "exome"
  • "phenopacket" (dict/file): Samples and phenotypic features parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. Default: None
  • "subject" (dict): Sample parameters (see Exomiser docs). Example: "subject": { "id": "ISDBM322017", "sex": "FEMALE" } Default: None
  • "sample" (string): Sample name to construct "subject" section: "subject": { "id": "", "sex": "UNKNOWN_SEX" } Default: None
  • "phenotypicFeatures" (dict) Phenotypic features to construct "subject" section. Example: "phenotypicFeatures": [ { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, { "type": { "id": "HP:0000486", "label": "Strabismus" } } ]
  • "hpo" (list) List of HPO ids as phenotypic features. Example: "hpo": ['0001156', '0001363', '0011304', '0010055'] Default: []
  • "outputOptions" (dict): Output options (see Exomiser docs). Default: "output_options" = { "outputContributingVariantsOnly": False, "numGenes": 0, "outputFormats": ["TSV_VARIANT", "VCF"] }
  • "transcript_source" (string): Transcript source (either "refseq", "ucsc", "ensembl") Default: "refseq"
  • "exomiser_to_info" (boolean): Add exomiser TSV file columns as INFO fields in VCF. Default: False
  • "release" (string): Exomiser database release. If it does not exist, the database release will be downloaded (takes a while). Default: None (provided by application.properties configuration file)
  • "exomiser_application_properties" (file): Exomiser configuration file (see Exomiser docs). Useful to automatically download databases (especially for specific genome databases).

Notes:

  • If no sample in parameters, first sample in VCF will be chosen
  • If no HPO found, "hiPhivePrioritiser" analysis step will be switched off
Parameters
  • threads: The number of threads to use
Returns

None.

def annotation_snpeff(self, threads: int = None) -> None:
4709    def annotation_snpeff(self, threads: int = None) -> None:
4710        """
4711        This function annotate with snpEff
4712
4713        :param threads: The number of threads to use
4714        :return: the value of the variable "return_value".
4715        """
4716
4717        # DEBUG
4718        log.debug("Start annotation with snpeff databases")
4719
4720        # Threads
4721        if not threads:
4722            threads = self.get_threads()
4723        log.debug("Threads: " + str(threads))
4724
4725        # DEBUG
4726        delete_tmp = True
4727        if self.get_config().get("verbosity", "warning") in ["debug"]:
4728            delete_tmp = False
4729            log.debug("Delete tmp files/folders: " + str(delete_tmp))
4730
4731        # Config
4732        config = self.get_config()
4733        log.debug("Config: " + str(config))
4734
4735        # Config - Folders - Databases
4736        databases_folders = (
4737            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
4738        )
4739        log.debug("Databases annotations: " + str(databases_folders))
4740
4741        # # Config - Java
4742        # java_bin = get_bin(
4743        #     tool="java",
4744        #     bin="java",
4745        #     bin_type="bin",
4746        #     config=config,
4747        #     default_folder="/usr/bin",
4748        # )
4749        # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))):
4750        #     log.error(f"Annotation failed: no java bin '{java_bin}'")
4751        #     raise ValueError(f"Annotation failed: no java bin '{java_bin}'")
4752
4753        # # Config - snpEff bin
4754        # snpeff_jar = get_bin(
4755        #     tool="snpeff",
4756        #     bin="snpEff.jar",
4757        #     bin_type="jar",
4758        #     config=config,
4759        #     default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
4760        # )
4761        # if not (os.path.exists(snpeff_jar) or (snpeff_jar and which(snpeff_jar))):
4762        #     log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
4763        #     raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
4764
4765        # Config - snpEff bin command
4766        snpeff_bin_command = get_bin_command(
4767            bin="snpEff.jar",
4768            tool="snpeff",
4769            bin_type="jar",
4770            config=config,
4771            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
4772        )
4773        if not snpeff_bin_command:
4774            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
4775            log.error(msg_err)
4776            raise ValueError(msg_err)
4777
4778        # Config - snpEff databases
4779        snpeff_databases = (
4780            config.get("folders", {})
4781            .get("databases", {})
4782            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
4783        )
4784        snpeff_databases = full_path(snpeff_databases)
4785        if snpeff_databases is not None and snpeff_databases != "":
4786            log.debug(f"Create snpEff databases folder")
4787            if not os.path.exists(snpeff_databases):
4788                os.makedirs(snpeff_databases)
4789
4790        # Param
4791        param = self.get_param()
4792        log.debug("Param: " + str(param))
4793
4794        # Param
4795        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
4796        log.debug("Options: " + str(options))
4797
4798        # Param - Assembly
4799        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
4800
4801        # Param - Options
4802        snpeff_options = (
4803            param.get("annotation", {}).get("snpeff", {}).get("options", "")
4804        )
4805        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
4806        snpeff_csvstats = (
4807            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
4808        )
4809        if snpeff_stats:
4810            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
4811            snpeff_stats = full_path(snpeff_stats)
4812            snpeff_options += f" -stats {snpeff_stats}"
4813        if snpeff_csvstats:
4814            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
4815            snpeff_csvstats = full_path(snpeff_csvstats)
4816            snpeff_options += f" -csvStats {snpeff_csvstats}"
4817
4818        # Data
4819        table_variants = self.get_table_variants()
4820
4821        # Check if not empty
4822        log.debug("Check if not empty")
4823        sql_query_chromosomes = (
4824            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
4825        )
4826        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
4827        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
4828            log.info(f"VCF empty")
4829            return
4830
4831        # Export in VCF
4832        log.debug("Create initial file to annotate")
4833        tmp_vcf = NamedTemporaryFile(
4834            prefix=self.get_prefix(),
4835            dir=self.get_tmp_dir(),
4836            suffix=".vcf.gz",
4837            delete=True,
4838        )
4839        tmp_vcf_name = tmp_vcf.name
4840
4841        # VCF header
4842        vcf_reader = self.get_header()
4843        log.debug("Initial header: " + str(vcf_reader.infos))
4844
4845        # Existing annotations
4846        for vcf_annotation in self.get_header().infos:
4847
4848            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
4849            log.debug(
4850                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
4851            )
4852
4853        # Memory limit
4854        # if config.get("memory", None):
4855        #     memory_limit = config.get("memory", "8G")
4856        # else:
4857        #     memory_limit = "8G"
4858        memory_limit = self.get_memory("8G")
4859        log.debug(f"memory_limit: {memory_limit}")
4860
4861        # snpEff java options
4862        snpeff_java_options = (
4863            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
4864        )
4865        log.debug(f"Exomiser java options: {snpeff_java_options}")
4866
4867        force_update_annotation = True
4868
4869        if "ANN" not in self.get_header().infos or force_update_annotation:
4870
4871            # Check snpEff database
4872            log.debug(f"Check snpEff databases {[assembly]}")
4873            databases_download_snpeff(
4874                folder=snpeff_databases, assemblies=[assembly], config=config
4875            )
4876
4877            # Export VCF file
4878            self.export_variant_vcf(
4879                vcf_file=tmp_vcf_name,
4880                remove_info=True,
4881                add_samples=False,
4882                index=True,
4883            )
4884
4885            # Tmp file
4886            err_files = []
4887            tmp_annotate_vcf = NamedTemporaryFile(
4888                prefix=self.get_prefix(),
4889                dir=self.get_tmp_dir(),
4890                suffix=".vcf",
4891                delete=False,
4892            )
4893            tmp_annotate_vcf_name = tmp_annotate_vcf.name
4894            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
4895            err_files.append(tmp_annotate_vcf_name_err)
4896
4897            # Command
4898            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
4899            log.debug(f"Annotation - snpEff command: {snpeff_command}")
4900            run_parallel_commands([snpeff_command], 1)
4901
4902            # Error messages
4903            log.info(f"Error/Warning messages:")
4904            error_message_command_all = []
4905            error_message_command_warning = []
4906            error_message_command_err = []
4907            for err_file in err_files:
4908                with open(err_file, "r") as f:
4909                    for line in f:
4910                        message = line.strip()
4911                        error_message_command_all.append(message)
4912                        if line.startswith("[W::"):
4913                            error_message_command_warning.append(message)
4914                        if line.startswith("[E::"):
4915                            error_message_command_err.append(f"{err_file}: " + message)
4916            # log info
4917            for message in list(
4918                set(error_message_command_err + error_message_command_warning)
4919            ):
4920                log.info(f"   {message}")
4921            # debug info
4922            for message in list(set(error_message_command_all)):
4923                log.debug(f"   {message}")
4924            # failed
4925            if len(error_message_command_err):
4926                log.error("Annotation failed: Error in commands")
4927                raise ValueError("Annotation failed: Error in commands")
4928
4929            # Find annotation in header
4930            with open(tmp_annotate_vcf_name, "rt") as f:
4931                header_list = self.read_vcf_header(f)
4932            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
4933
4934            for ann in annovar_vcf_header.infos:
4935                if ann not in self.get_header().infos:
4936                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
4937
4938            # Update variants
4939            log.info(f"Annotation - Updating...")
4940            self.update_from_vcf(tmp_annotate_vcf_name)
4941
4942        else:
4943            if "ANN" in self.get_header().infos:
4944                log.debug(f"Existing snpEff annotations in VCF")
4945            if force_update_annotation:
4946                log.debug(f"Existing snpEff annotations in VCF - annotation forced")

This function annotates with snpEff

Parameters
  • threads: The number of threads to use
Returns

the value of the variable "return_value".

def annotation_annovar(self, threads: int = None) -> None:
4948    def annotation_annovar(self, threads: int = None) -> None:
4949        """
4950        It takes a VCF file, annotates it with Annovar, and then updates the database with the new
4951        annotations
4952
4953        :param threads: number of threads to use
4954        :return: the value of the variable "return_value".
4955        """
4956
4957        # DEBUG
4958        log.debug("Start annotation with Annovar databases")
4959
4960        # Threads
4961        if not threads:
4962            threads = self.get_threads()
4963        log.debug("Threads: " + str(threads))
4964
4965        # Tmp en Err files
4966        tmp_files = []
4967        err_files = []
4968
4969        # DEBUG
4970        delete_tmp = True
4971        if self.get_config().get("verbosity", "warning") in ["debug"]:
4972            delete_tmp = False
4973            log.debug("Delete tmp files/folders: " + str(delete_tmp))
4974
4975        # Config
4976        config = self.get_config()
4977        log.debug("Config: " + str(config))
4978
4979        # Config - Folders - Databases
4980        databases_folders = (
4981            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
4982        )
4983        log.debug("Databases annotations: " + str(databases_folders))
4984
4985        # Config - annovar bin command
4986        annovar_bin_command = get_bin_command(
4987            bin="table_annovar.pl",
4988            tool="annovar",
4989            bin_type="perl",
4990            config=config,
4991            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
4992        )
4993        if not annovar_bin_command:
4994            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
4995            log.error(msg_err)
4996            raise ValueError(msg_err)
4997
4998        # Config - BCFTools bin command
4999        bcftools_bin_command = get_bin_command(
5000            bin="bcftools",
5001            tool="bcftools",
5002            bin_type="bin",
5003            config=config,
5004            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
5005        )
5006        if not bcftools_bin_command:
5007            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
5008            log.error(msg_err)
5009            raise ValueError(msg_err)
5010
5011        # Config - annovar databases
5012        annovar_databases = (
5013            config.get("folders", {})
5014            .get("databases", {})
5015            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
5016        )
5017        annovar_databases = full_path(annovar_databases)
5018        if annovar_databases != "" and not os.path.exists(annovar_databases):
5019            os.makedirs(annovar_databases)
5020
5021        # Param
5022        param = self.get_param()
5023        log.debug("Param: " + str(param))
5024
5025        # Param - options
5026        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
5027        log.debug("Options: " + str(options))
5028
5029        # Param - annotations
5030        annotations = (
5031            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
5032        )
5033        log.debug("Annotations: " + str(annotations))
5034
5035        # Param - Assembly
5036        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
5037
5038        # Annovar database assembly
5039        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
5040        if annovar_databases_assembly != "" and not os.path.exists(
5041            annovar_databases_assembly
5042        ):
5043            os.makedirs(annovar_databases_assembly)
5044
5045        # Data
5046        table_variants = self.get_table_variants()
5047
5048        # Check if not empty
5049        log.debug("Check if not empty")
5050        sql_query_chromosomes = (
5051            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
5052        )
5053        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
5054        if not sql_query_chromosomes_df["count"][0]:
5055            log.info(f"VCF empty")
5056            return
5057
5058        # VCF header
5059        vcf_reader = self.get_header()
5060        log.debug("Initial header: " + str(vcf_reader.infos))
5061
5062        # Existing annotations
5063        for vcf_annotation in self.get_header().infos:
5064
5065            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
5066            log.debug(
5067                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
5068            )
5069
5070        force_update_annotation = True
5071
5072        if annotations:
5073
5074            commands = []
5075            tmp_annotates_vcf_name_list = []
5076
5077            # Export in VCF
5078            log.debug("Create initial file to annotate")
5079            tmp_vcf = NamedTemporaryFile(
5080                prefix=self.get_prefix(),
5081                dir=self.get_tmp_dir(),
5082                suffix=".vcf.gz",
5083                delete=False,
5084            )
5085            tmp_vcf_name = tmp_vcf.name
5086            tmp_files.append(tmp_vcf_name)
5087            tmp_files.append(tmp_vcf_name + ".tbi")
5088
5089            # Export VCF file
5090            self.export_variant_vcf(
5091                vcf_file=tmp_vcf_name,
5092                remove_info=".",
5093                add_samples=False,
5094                index=True,
5095            )
5096
5097            # Create file for field rename
5098            log.debug("Create file for field rename")
5099            tmp_rename = NamedTemporaryFile(
5100                prefix=self.get_prefix(),
5101                dir=self.get_tmp_dir(),
5102                suffix=".rename",
5103                delete=False,
5104            )
5105            tmp_rename_name = tmp_rename.name
5106            tmp_files.append(tmp_rename_name)
5107
5108            # Check Annovar database
5109            log.debug(
5110                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
5111            )
5112            databases_download_annovar(
5113                folder=annovar_databases,
5114                files=list(annotations.keys()),
5115                assemblies=[assembly],
5116            )
5117
5118            for annotation in annotations:
5119                annotation_fields = annotations[annotation]
5120
5121                if not annotation_fields:
5122                    annotation_fields = {"INFO": None}
5123
5124                log.info(f"Annotations Annovar - database '{annotation}'")
5125                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")
5126
5127                # Tmp file for annovar
5128                err_files = []
5129                tmp_annotate_vcf_directory = TemporaryDirectory(
5130                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
5131                )
5132                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
5133                tmp_annotate_vcf_name_annovar = (
5134                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
5135                )
5136                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
5137                err_files.append(tmp_annotate_vcf_name_err)
5138                tmp_files.append(tmp_annotate_vcf_name_err)
5139
5140                # Tmp file final vcf annotated by annovar
5141                tmp_annotate_vcf = NamedTemporaryFile(
5142                    prefix=self.get_prefix(),
5143                    dir=self.get_tmp_dir(),
5144                    suffix=".vcf.gz",
5145                    delete=False,
5146                )
5147                tmp_annotate_vcf_name = tmp_annotate_vcf.name
5148                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
5149                tmp_files.append(tmp_annotate_vcf_name)
5150                tmp_files.append(tmp_annotate_vcf_name + ".tbi")
5151
5152                # Number of fields
5153                annotation_list = []
5154                annotation_renamed_list = []
5155
5156                for annotation_field in annotation_fields:
5157
5158                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
5159                    annotation_fields_new_name = annotation_fields.get(
5160                        annotation_field, annotation_field
5161                    )
5162                    if not annotation_fields_new_name:
5163                        annotation_fields_new_name = annotation_field
5164
5165                    if (
5166                        force_update_annotation
5167                        or annotation_fields_new_name not in self.get_header().infos
5168                    ):
5169                        annotation_list.append(annotation_field)
5170                        annotation_renamed_list.append(annotation_fields_new_name)
5171                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
5172                        log.warning(
5173                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
5174                        )
5175
5176                    # Add rename info
5177                    run_parallel_commands(
5178                        [
5179                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
5180                        ],
5181                        1,
5182                    )
5183
5184                # log.debug("fields_to_removed: " + str(fields_to_removed))
5185                log.debug("annotation_list: " + str(annotation_list))
5186
5187                # protocol
5188                protocol = annotation
5189
5190                # argument
5191                argument = ""
5192
5193                # operation
5194                operation = "f"
5195                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
5196                    "ensGene"
5197                ):
5198                    operation = "g"
5199                    if options.get("genebase", None):
5200                        argument = f"""'{options.get("genebase","")}'"""
5201                elif annotation in ["cytoBand"]:
5202                    operation = "r"
5203
5204                # argument option
5205                argument_option = ""
5206                if argument != "":
5207                    argument_option = " --argument " + argument
5208
5209                # command options
5210                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
5211                for option in options:
5212                    if option not in ["genebase"]:
5213                        command_options += f""" --{option}={options[option]}"""
5214
5215                # Command
5216
5217                # Command - Annovar
5218                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
5219                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")
5220
5221                # Command - start pipe
5222                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """
5223
5224                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
5225                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """
5226
5227                # Command - Special characters (refGene annotation)
5228                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """
5229
5230                # Command - Clean empty fields (with value ".")
5231                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """
5232
5233                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
5234                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
5235                if "ALL" not in annotation_list and "INFO" not in annotation_list:
5236                    # for ann in annotation_renamed_list:
5237                    for ann in annotation_list:
5238                        annovar_fields_to_keep.append(f"^INFO/{ann}")
5239
5240                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """
5241
5242                # Command - indexing
5243                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """
5244
5245                log.debug(f"Annotation - Annovar command: {command_annovar}")
5246                run_parallel_commands([command_annovar], 1)
5247
5248                # Error messages
5249                log.info(f"Error/Warning messages:")
5250                error_message_command_all = []
5251                error_message_command_warning = []
5252                error_message_command_err = []
5253                for err_file in err_files:
5254                    with open(err_file, "r") as f:
5255                        for line in f:
5256                            message = line.strip()
5257                            error_message_command_all.append(message)
5258                            if line.startswith("[W::") or line.startswith("WARNING"):
5259                                error_message_command_warning.append(message)
5260                            if line.startswith("[E::") or line.startswith("ERROR"):
5261                                error_message_command_err.append(
5262                                    f"{err_file}: " + message
5263                                )
5264                # log info
5265                for message in list(
5266                    set(error_message_command_err + error_message_command_warning)
5267                ):
5268                    log.info(f"   {message}")
5269                # debug info
5270                for message in list(set(error_message_command_all)):
5271                    log.debug(f"   {message}")
5272                # failed
5273                if len(error_message_command_err):
5274                    log.error("Annotation failed: Error in commands")
5275                    raise ValueError("Annotation failed: Error in commands")
5276
5277            if tmp_annotates_vcf_name_list:
5278
5279                # List of annotated files
5280                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)
5281
5282                # Tmp file
5283                tmp_annotate_vcf = NamedTemporaryFile(
5284                    prefix=self.get_prefix(),
5285                    dir=self.get_tmp_dir(),
5286                    suffix=".vcf.gz",
5287                    delete=False,
5288                )
5289                tmp_annotate_vcf_name = tmp_annotate_vcf.name
5290                tmp_files.append(tmp_annotate_vcf_name)
5291                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
5292                err_files.append(tmp_annotate_vcf_name_err)
5293                tmp_files.append(tmp_annotate_vcf_name_err)
5294
5295                # Command merge
5296                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
5297                log.info(
5298                    f"Annotation Annovar - Annotation merging "
5299                    + str(len(tmp_annotates_vcf_name_list))
5300                    + " annotated files"
5301                )
5302                log.debug(f"Annotation - merge command: {merge_command}")
5303                run_parallel_commands([merge_command], 1)
5304
5305                # Find annotation in header
5306                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
5307                    header_list = self.read_vcf_header(f)
5308                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
5309
5310                for ann in annovar_vcf_header.infos:
5311                    if ann not in self.get_header().infos:
5312                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
5313
5314                # Update variants
5315                log.info(f"Annotation Annovar - Updating...")
5316                self.update_from_vcf(tmp_annotate_vcf_name)
5317
5318            # Clean files
5319            # Tmp file remove command
5320            if True:
5321                tmp_files_remove_command = ""
5322                if tmp_files:
5323                    tmp_files_remove_command = " ".join(tmp_files)
5324                clean_command = f" rm -f {tmp_files_remove_command} "
5325                log.debug(f"Annotation Annovar - Annotation cleaning ")
5326                log.debug(f"Annotation - cleaning command: {clean_command}")
5327                run_parallel_commands([clean_command], 1)

    # NOTE(review): the following text appears to be a displaced/duplicated
    # docstring fragment for the preceding Annovar annotation method (an
    # extraction artifact, including a duplicated `def annotation_parquet`
    # line). It is preserved here as comments so no content is lost:
    #
    #   It takes a VCF file, annotates it with Annovar, and then updates the
    #   database with the new annotations.
    #
    #   :param threads: number of threads to use
    #   :return: the value of the variable "return_value".
5330    def annotation_parquet(self, threads: int = None) -> None:
5331        """
5332        It takes a VCF file, and annotates it with a parquet file
5333
5334        :param threads: number of threads to use for the annotation
5335        :return: the value of the variable "result".
5336        """
5337
5338        # DEBUG
5339        log.debug("Start annotation with parquet databases")
5340
5341        # Threads
5342        if not threads:
5343            threads = self.get_threads()
5344        log.debug("Threads: " + str(threads))
5345
5346        # DEBUG
5347        delete_tmp = True
5348        if self.get_config().get("verbosity", "warning") in ["debug"]:
5349            delete_tmp = False
5350            log.debug("Delete tmp files/folders: " + str(delete_tmp))
5351
5352        # Config
5353        databases_folders = set(
5354            self.get_config()
5355            .get("folders", {})
5356            .get("databases", {})
5357            .get("annotations", ["."])
5358            + self.get_config()
5359            .get("folders", {})
5360            .get("databases", {})
5361            .get("parquet", ["."])
5362        )
5363        log.debug("Databases annotations: " + str(databases_folders))
5364
5365        # Param
5366        annotations = (
5367            self.get_param()
5368            .get("annotation", {})
5369            .get("parquet", {})
5370            .get("annotations", None)
5371        )
5372        log.debug("Annotations: " + str(annotations))
5373
5374        # Assembly
5375        assembly = self.get_param().get(
5376            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
5377        )
5378
5379        # Force Update Annotation
5380        force_update_annotation = (
5381            self.get_param()
5382            .get("annotation", {})
5383            .get("options", {})
5384            .get("annotations_update", False)
5385        )
5386        log.debug(f"force_update_annotation={force_update_annotation}")
5387        force_append_annotation = (
5388            self.get_param()
5389            .get("annotation", {})
5390            .get("options", {})
5391            .get("annotations_append", False)
5392        )
5393        log.debug(f"force_append_annotation={force_append_annotation}")
5394
5395        # Data
5396        table_variants = self.get_table_variants()
5397
5398        # Check if not empty
5399        log.debug("Check if not empty")
5400        sql_query_chromosomes_df = self.get_query_to_df(
5401            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
5402        )
5403        if not sql_query_chromosomes_df["count"][0]:
5404            log.info(f"VCF empty")
5405            return
5406
5407        # VCF header
5408        vcf_reader = self.get_header()
5409        log.debug("Initial header: " + str(vcf_reader.infos))
5410
5411        # Nb Variants POS
5412        log.debug("NB Variants Start")
5413        nb_variants = self.conn.execute(
5414            f"SELECT count(*) AS count FROM variants"
5415        ).fetchdf()["count"][0]
5416        log.debug("NB Variants Stop")
5417
5418        # Existing annotations
5419        for vcf_annotation in self.get_header().infos:
5420
5421            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
5422            log.debug(
5423                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
5424            )
5425
5426        # Added columns
5427        added_columns = []
5428
5429        # drop indexes
5430        log.debug(f"Drop indexes...")
5431        self.drop_indexes()
5432
5433        if annotations:
5434
5435            if "ALL" in annotations:
5436
5437                all_param = annotations.get("ALL", {})
5438                all_param_formats = all_param.get("formats", None)
5439                all_param_releases = all_param.get("releases", None)
5440
5441                databases_infos_dict = self.scan_databases(
5442                    database_formats=all_param_formats,
5443                    database_releases=all_param_releases,
5444                )
5445                for database_infos in databases_infos_dict.keys():
5446                    if database_infos not in annotations:
5447                        annotations[database_infos] = {"INFO": None}
5448
5449            for annotation in annotations:
5450
5451                if annotation in ["ALL"]:
5452                    continue
5453
5454                # Annotation Name
5455                annotation_name = os.path.basename(annotation)
5456
5457                # Annotation fields
5458                annotation_fields = annotations[annotation]
5459                if not annotation_fields:
5460                    annotation_fields = {"INFO": None}
5461
5462                log.debug(f"Annotation '{annotation_name}'")
5463                log.debug(
5464                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
5465                )
5466
5467                # Create Database
5468                database = Database(
5469                    database=annotation,
5470                    databases_folders=databases_folders,
5471                    assembly=assembly,
5472                )
5473
5474                # Find files
5475                parquet_file = database.get_database()
5476                parquet_hdr_file = database.get_header_file()
5477                parquet_type = database.get_type()
5478
5479                # Check if files exists
5480                if not parquet_file or not parquet_hdr_file:
5481                    log.error("Annotation failed: file not found")
5482                    raise ValueError("Annotation failed: file not found")
5483                else:
5484                    # Get parquet connexion
5485                    parquet_sql_attach = database.get_sql_database_attach(
5486                        output="query"
5487                    )
5488                    if parquet_sql_attach:
5489                        self.conn.execute(parquet_sql_attach)
5490                    parquet_file_link = database.get_sql_database_link()
5491                    # Log
5492                    log.debug(
5493                        f"Annotation '{annotation_name}' - file: "
5494                        + str(parquet_file)
5495                        + " and "
5496                        + str(parquet_hdr_file)
5497                    )
5498
5499                    # Database full header columns
5500                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
5501                        parquet_hdr_file
5502                    )
5503                    # Log
5504                    log.debug(
5505                        "Annotation database header columns : "
5506                        + str(parquet_hdr_vcf_header_columns)
5507                    )
5508
5509                    # Load header as VCF object
5510                    parquet_hdr_vcf_header_infos = database.get_header().infos
5511                    # Log
5512                    log.debug(
5513                        "Annotation database header: "
5514                        + str(parquet_hdr_vcf_header_infos)
5515                    )
5516
5517                    # Get extra infos
5518                    parquet_columns = database.get_extra_columns()
5519                    # Log
5520                    log.debug("Annotation database Columns: " + str(parquet_columns))
5521
5522                    # Add extra columns if "ALL" in annotation_fields
5523                    # if "ALL" in annotation_fields:
5524                    #     allow_add_extra_column = True
5525                    if "ALL" in annotation_fields and database.get_extra_columns():
5526                        for extra_column in database.get_extra_columns():
5527                            if (
5528                                extra_column not in annotation_fields
5529                                and extra_column.replace("INFO/", "")
5530                                not in parquet_hdr_vcf_header_infos
5531                            ):
5532                                parquet_hdr_vcf_header_infos[extra_column] = (
5533                                    vcf.parser._Info(
5534                                        extra_column,
5535                                        ".",
5536                                        "String",
5537                                        f"{extra_column} description",
5538                                        "unknown",
5539                                        "unknown",
5540                                        self.code_type_map["String"],
5541                                    )
5542                                )
5543
5544                    # For all fields in database
5545                    annotation_fields_all = False
5546                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
5547                        annotation_fields_all = True
5548                        annotation_fields = {
5549                            key: key for key in parquet_hdr_vcf_header_infos
5550                        }
5551
5552                        log.debug(
5553                            "Annotation database header - All annotations added: "
5554                            + str(annotation_fields)
5555                        )
5556
5557                    # Init
5558
5559                    # List of annotation fields to use
5560                    sql_query_annotation_update_info_sets = []
5561
5562                    # List of annotation to agregate
5563                    sql_query_annotation_to_agregate = []
5564
5565                    # Number of fields
5566                    nb_annotation_field = 0
5567
5568                    # Annotation fields processed
5569                    annotation_fields_processed = []
5570
5571                    # Columns mapping
5572                    map_columns = database.map_columns(
5573                        columns=annotation_fields, prefixes=["INFO/"]
5574                    )
5575
5576                    # Query dict for fields to remove (update option)
5577                    query_dict_remove = {}
5578
5579                    # Fetch Anotation fields
5580                    for annotation_field in annotation_fields:
5581
5582                        # annotation_field_column
5583                        annotation_field_column = map_columns.get(
5584                            annotation_field, "INFO"
5585                        )
5586
5587                        # field new name, if parametered
5588                        annotation_fields_new_name = annotation_fields.get(
5589                            annotation_field, annotation_field
5590                        )
5591                        if not annotation_fields_new_name:
5592                            annotation_fields_new_name = annotation_field
5593
5594                        # To annotate
5595                        # force_update_annotation = True
5596                        # force_append_annotation = True
5597                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
5598                        if annotation_field in parquet_hdr_vcf_header_infos and (
5599                            force_update_annotation
5600                            or force_append_annotation
5601                            or (
5602                                annotation_fields_new_name
5603                                not in self.get_header().infos
5604                            )
5605                        ):
5606
5607                            # Add field to annotation to process list
5608                            annotation_fields_processed.append(
5609                                annotation_fields_new_name
5610                            )
5611
5612                            # explode infos for the field
5613                            annotation_fields_new_name_info_msg = ""
5614                            if (
5615                                force_update_annotation
5616                                and annotation_fields_new_name
5617                                in self.get_header().infos
5618                            ):
5619                                # Remove field from INFO
5620                                query = f"""
5621                                    UPDATE {table_variants} as table_variants
5622                                    SET INFO = REGEXP_REPLACE(
5623                                                concat(table_variants.INFO,''),
5624                                                ';*{annotation_fields_new_name}=[^;]*',
5625                                                ''
5626                                                )
5627                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
5628                                """
5629                                annotation_fields_new_name_info_msg = " [update]"
5630                                query_dict_remove[
5631                                    f"remove 'INFO/{annotation_fields_new_name}'"
5632                                ] = query
5633
5634                            # Sep between fields in INFO
5635                            nb_annotation_field += 1
5636                            if nb_annotation_field > 1:
5637                                annotation_field_sep = ";"
5638                            else:
5639                                annotation_field_sep = ""
5640
5641                            log.info(
5642                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
5643                            )
5644
5645                            # Add INFO field to header
5646                            parquet_hdr_vcf_header_infos_number = (
5647                                parquet_hdr_vcf_header_infos[annotation_field].num
5648                                or "."
5649                            )
5650                            parquet_hdr_vcf_header_infos_type = (
5651                                parquet_hdr_vcf_header_infos[annotation_field].type
5652                                or "String"
5653                            )
5654                            parquet_hdr_vcf_header_infos_description = (
5655                                parquet_hdr_vcf_header_infos[annotation_field].desc
5656                                or f"{annotation_field} description"
5657                            )
5658                            parquet_hdr_vcf_header_infos_source = (
5659                                parquet_hdr_vcf_header_infos[annotation_field].source
5660                                or "unknown"
5661                            )
5662                            parquet_hdr_vcf_header_infos_version = (
5663                                parquet_hdr_vcf_header_infos[annotation_field].version
5664                                or "unknown"
5665                            )
5666
5667                            vcf_reader.infos[annotation_fields_new_name] = (
5668                                vcf.parser._Info(
5669                                    annotation_fields_new_name,
5670                                    parquet_hdr_vcf_header_infos_number,
5671                                    parquet_hdr_vcf_header_infos_type,
5672                                    parquet_hdr_vcf_header_infos_description,
5673                                    parquet_hdr_vcf_header_infos_source,
5674                                    parquet_hdr_vcf_header_infos_version,
5675                                    self.code_type_map[
5676                                        parquet_hdr_vcf_header_infos_type
5677                                    ],
5678                                )
5679                            )
5680
5681                            # Append
5682                            if force_append_annotation:
5683                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
5684                            else:
5685                                query_case_when_append = ""
5686
5687                            # Annotation/Update query fields
5688                            # Found in INFO column
5689                            if (
5690                                annotation_field_column == "INFO"
5691                                and "INFO" in parquet_hdr_vcf_header_columns
5692                            ):
5693                                sql_query_annotation_update_info_sets.append(
5694                                    f"""
5695                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
5696                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
5697                                        ELSE ''
5698                                    END
5699                                """
5700                                )
5701                            # Found in a specific column
5702                            else:
5703                                sql_query_annotation_update_info_sets.append(
5704                                    f"""
5705                                CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append}
5706                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(table_parquet."{annotation_field_column}", ';', ','))
5707                                        ELSE ''
5708                                    END
5709                                """
5710                                )
5711                                sql_query_annotation_to_agregate.append(
5712                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
5713                                )
5714
5715                        # Not to annotate
5716                        else:
5717
5718                            if force_update_annotation:
5719                                annotation_message = "forced"
5720                            else:
5721                                annotation_message = "skipped"
5722
5723                            if annotation_field not in parquet_hdr_vcf_header_infos:
5724                                log.warning(
5725                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
5726                                )
5727                            if annotation_fields_new_name in self.get_header().infos:
5728                                log.warning(
5729                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
5730                                )
5731
5732                    # Check if ALL fields have to be annotated. Thus concat all INFO field
5733                    # allow_annotation_full_info = True
5734                    allow_annotation_full_info = not force_append_annotation
5735
5736                    if parquet_type in ["regions"]:
5737                        allow_annotation_full_info = False
5738
5739                    if (
5740                        allow_annotation_full_info
5741                        and nb_annotation_field == len(annotation_fields)
5742                        and annotation_fields_all
5743                        and (
5744                            "INFO" in parquet_hdr_vcf_header_columns
5745                            and "INFO" in database.get_extra_columns()
5746                        )
5747                    ):
5748                        log.debug("Column INFO annotation enabled")
5749                        sql_query_annotation_update_info_sets = []
5750                        sql_query_annotation_update_info_sets.append(
5751                            f" table_parquet.INFO "
5752                        )
5753
5754                    if sql_query_annotation_update_info_sets:
5755
5756                        # Annotate
5757                        log.info(f"Annotation '{annotation_name}' - Annotation...")
5758
5759                        # Join query annotation update info sets for SQL
5760                        sql_query_annotation_update_info_sets_sql = ",".join(
5761                            sql_query_annotation_update_info_sets
5762                        )
5763
5764                        # Check chromosomes list (and variants infos)
5765                        sql_query_chromosomes = f"""
5766                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
5767                            FROM {table_variants} as table_variants
5768                            GROUP BY table_variants."#CHROM"
5769                            ORDER BY table_variants."#CHROM"
5770                            """
5771                        sql_query_chromosomes_df = self.conn.execute(
5772                            sql_query_chromosomes
5773                        ).df()
5774                        sql_query_chromosomes_dict = {
5775                            entry["CHROM"]: {
5776                                "count": entry["count_variants"],
5777                                "min": entry["min_variants"],
5778                                "max": entry["max_variants"],
5779                            }
5780                            for index, entry in sql_query_chromosomes_df.iterrows()
5781                        }
5782
5783                        # Init
5784                        nb_of_query = 0
5785                        nb_of_variant_annotated = 0
5786                        query_dict = query_dict_remove
5787
5788                        # for chrom in sql_query_chromosomes_df["CHROM"]:
5789                        for chrom in sql_query_chromosomes_dict:
5790
5791                            # Number of variant by chromosome
5792                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
5793                                chrom, {}
5794                            ).get("count", 0)
5795
5796                            log.debug(
5797                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
5798                            )
5799
5800                            # Annotation with regions database
5801                            if parquet_type in ["regions"]:
5802                                sql_query_annotation_from_clause = f"""
5803                                    FROM (
5804                                        SELECT 
5805                                            '{chrom}' AS \"#CHROM\",
5806                                            table_variants_from.\"POS\" AS \"POS\",
5807                                            {",".join(sql_query_annotation_to_agregate)}
5808                                        FROM {table_variants} as table_variants_from
5809                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
5810                                            table_parquet_from."#CHROM" = '{chrom}'
5811                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
5812                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
5813                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
5814                                                )
5815                                        )
5816                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
5817                                        GROUP BY table_variants_from.\"POS\"
5818                                        )
5819                                        as table_parquet
5820                                """
5821
5822                                sql_query_annotation_where_clause = """
5823                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
5824                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
5825                                """
5826
5827                            # Annotation with variants database
5828                            else:
5829                                sql_query_annotation_from_clause = f"""
5830                                    FROM {parquet_file_link} as table_parquet
5831                                """
5832                                sql_query_annotation_where_clause = f"""
5833                                    table_variants."#CHROM" = '{chrom}'
5834                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
5835                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
5836                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
5837                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
5838                                """
5839
5840                            # Create update query
5841                            sql_query_annotation_chrom_interval_pos = f"""
5842                                UPDATE {table_variants} as table_variants
5843                                    SET INFO = 
5844                                        concat(
5845                                            CASE WHEN table_variants.INFO NOT IN ('','.')
5846                                                THEN table_variants.INFO
5847                                                ELSE ''
5848                                            END
5849                                            ,
5850                                            CASE WHEN table_variants.INFO NOT IN ('','.')
5851                                                        AND (
5852                                                        concat({sql_query_annotation_update_info_sets_sql})
5853                                                        )
5854                                                        NOT IN ('','.') 
5855                                                    THEN ';'
5856                                                    ELSE ''
5857                                            END
5858                                            ,
5859                                            {sql_query_annotation_update_info_sets_sql}
5860                                            )
5861                                    {sql_query_annotation_from_clause}
5862                                    WHERE {sql_query_annotation_where_clause}
5863                                    ;
5864                                """
5865
5866                            # Add update query to dict
5867                            query_dict[
5868                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
5869                            ] = sql_query_annotation_chrom_interval_pos
5870
5871                        nb_of_query = len(query_dict)
5872                        num_query = 0
5873
5874                        # SET max_expression_depth TO x
5875                        self.conn.execute("SET max_expression_depth TO 10000")
5876
5877                        for query_name in query_dict:
5878                            query = query_dict[query_name]
5879                            num_query += 1
5880                            log.info(
5881                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
5882                            )
5883                            result = self.conn.execute(query)
5884                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
5885                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
5886                            log.info(
5887                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
5888                            )
5889
5890                        log.info(
5891                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
5892                        )
5893
5894                    else:
5895
5896                        log.info(
5897                            f"Annotation '{annotation_name}' - No Annotations available"
5898                        )
5899
5900                    log.debug("Final header: " + str(vcf_reader.infos))
5901
5902        # Remove added columns
5903        for added_column in added_columns:
5904            self.drop_column(column=added_column)

It takes a VCF file, and annotates it with a parquet file.

Parameters:
  - threads: number of threads to use for the annotation.

Returns:
  the value of the variable "result".
5906    def annotation_splice(self, threads: int = None) -> None:
5907        """
5908        This function annotate with snpEff
5909
5910        :param threads: The number of threads to use
5911        :return: the value of the variable "return_value".
5912        """
5913
5914        # DEBUG
5915        log.debug("Start annotation with splice tools")
5916
5917        # Threads
5918        if not threads:
5919            threads = self.get_threads()
5920        log.debug("Threads: " + str(threads))
5921
5922        # DEBUG
5923        delete_tmp = True
5924        if self.get_config().get("verbosity", "warning") in ["debug"]:
5925            delete_tmp = False
5926            log.debug("Delete tmp files/folders: " + str(delete_tmp))
5927
5928        # Config
5929        config = self.get_config()
5930        log.debug("Config: " + str(config))
5931        splice_config = config.get("tools", {}).get("splice", {})
5932        if not splice_config:
5933            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
5934        if not splice_config:
5935            msg_err = "No Splice tool config"
5936            log.error(msg_err)
5937            raise ValueError(msg_err)
5938        log.debug(f"splice_config={splice_config}")
5939
5940        # Config - Folders - Databases
5941        databases_folders = (
5942            config.get("folders", {}).get("databases", {}).get("splice", ["."])
5943        )
5944        log.debug("Databases annotations: " + str(databases_folders))
5945
5946        # Splice docker image
5947        splice_docker_image = splice_config.get("docker").get("image")
5948
5949        # Pull splice image if it's not already there
5950        if not check_docker_image_exists(splice_docker_image):
5951            log.warning(
5952                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
5953            )
5954            try:
5955                command(f"docker pull {splice_config.get('docker').get('image')}")
5956            except subprocess.CalledProcessError:
5957                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
5958                log.error(msg_err)
5959                raise ValueError(msg_err)
5960                return None
5961
5962        # Config - splice databases
5963        splice_databases = (
5964            config.get("folders", {})
5965            .get("databases", {})
5966            .get("splice", DEFAULT_SPLICE_FOLDER)
5967        )
5968        splice_databases = full_path(splice_databases)
5969
5970        # Param
5971        param = self.get_param()
5972        log.debug("Param: " + str(param))
5973
5974        # Param
5975        options = param.get("annotation", {}).get("splice", {})
5976        log.debug("Options: " + str(options))
5977
5978        # Data
5979        table_variants = self.get_table_variants()
5980
5981        # Check if not empty
5982        log.debug("Check if not empty")
5983        sql_query_chromosomes = (
5984            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
5985        )
5986        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
5987            log.info("VCF empty")
5988            return None
5989
5990        # Export in VCF
5991        log.debug("Create initial file to annotate")
5992
5993        # Create output folder
5994        output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
5995        if not os.path.exists(output_folder):
5996            Path(output_folder).mkdir(parents=True, exist_ok=True)
5997
5998        # Create tmp VCF file
5999        tmp_vcf = NamedTemporaryFile(
6000            prefix=self.get_prefix(),
6001            dir=output_folder,
6002            suffix=".vcf",
6003            delete=False,
6004        )
6005        tmp_vcf_name = tmp_vcf.name
6006
6007        # VCF header
6008        header = self.get_header()
6009
6010        # Existing annotations
6011        for vcf_annotation in self.get_header().infos:
6012
6013            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
6014            log.debug(
6015                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
6016            )
6017
6018        # Memory limit
6019        if config.get("memory", None):
6020            memory_limit = config.get("memory", "8G").upper()
6021            # upper()
6022        else:
6023            memory_limit = "8G"
6024        log.debug(f"memory_limit: {memory_limit}")
6025
6026        # Check number of variants to annotate
6027        where_clause_regex_spliceai = r"SpliceAI_\w+"
6028        where_clause_regex_spip = r"SPiP_\w+"
6029        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
6030        df_list_of_variants_to_annotate = self.get_query_to_df(
6031            query=f""" SELECT * FROM variants {where_clause} """
6032        )
6033        if len(df_list_of_variants_to_annotate) == 0:
6034            log.warning(
6035                f"No variants to annotate with splice. Variants probably already annotated with splice"
6036            )
6037            return None
6038        else:
6039            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
6040
6041        # Export VCF file
6042        self.export_variant_vcf(
6043            vcf_file=tmp_vcf_name,
6044            remove_info=True,
6045            add_samples=True,
6046            index=False,
6047            where_clause=where_clause,
6048        )
6049
6050        # Create docker container and launch splice analysis
6051        if splice_config:
6052
6053            # Splice mount folders
6054            mount_folders = splice_config.get("mount", {})
6055
6056            # Genome mount
6057            mount_folders[
6058                config.get("folders", {})
6059                .get("databases", {})
6060                .get("genomes", DEFAULT_GENOME_FOLDER)
6061            ] = "ro"
6062
6063            # SpliceAI mount
6064            mount_folders[
6065                config.get("folders", {})
6066                .get("databases", {})
6067                .get("spliceai", DEFAULT_SPLICEAI_FOLDER)
6068            ] = "ro"
6069
6070            # Genome mount
6071            mount_folders[
6072                config.get("folders", {})
6073                .get("databases", {})
6074                .get("spip", DEFAULT_SPIP_FOLDER)
6075            ] = "ro"
6076
6077            # Mount folders
6078            mount = []
6079
6080            # Config mount
6081            mount = [
6082                f"-v {full_path(path)}:{full_path(path)}:{mode}"
6083                for path, mode in mount_folders.items()
6084            ]
6085
6086            if any(value for value in splice_config.values() if value is None):
6087                log.warning("At least one splice config parameter is empty")
6088                return None
6089
6090            # Params in splice nf
6091            def check_values(dico: dict):
6092                """
6093                Ensure parameters for NF splice pipeline
6094                """
6095                for key, val in dico.items():
6096                    if key == "genome":
6097                        if any(
6098                            assemb in options.get("genome", {})
6099                            for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
6100                        ):
6101                            yield f"--{key} hg19"
6102                        elif any(
6103                            assemb in options.get("genome", {})
6104                            for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
6105                        ):
6106                            yield f"--{key} hg38"
6107                    elif (
6108                        (isinstance(val, str) and val)
6109                        or isinstance(val, int)
6110                        or isinstance(val, bool)
6111                    ):
6112                        yield f"--{key} {val}"
6113
6114            # Genome
6115            genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
6116            options["genome"] = genome
6117
6118            # NF params
6119            nf_params = []
6120
6121            # Add options
6122            if options:
6123                nf_params = list(check_values(options))
6124                log.debug(f"Splice NF params: {' '.join(nf_params)}")
6125            else:
6126                log.debug("No NF params provided")
6127
6128            # Add threads
6129            if "threads" not in options.keys():
6130                nf_params.append(f"--threads {threads}")
6131
6132            # Genome path
6133            genome_path = find_genome(
6134                config.get("folders", {})
6135                .get("databases", {})
6136                .get("genomes", DEFAULT_GENOME_FOLDER),
6137                file=f"{genome}.fa",
6138            )
6139            # Add genome path
6140            if not genome_path:
6141                raise ValueError(
6142                    f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
6143                )
6144            else:
6145                log.debug(f"Genome: {genome_path}")
6146                nf_params.append(f"--genome_path {genome_path}")
6147
6148            def splice_annotations(options: dict = {}, config: dict = {}) -> list:
6149                """
6150                Setting up updated databases for SPiP and SpliceAI
6151                """
6152
6153                try:
6154
6155                    # SpliceAI assembly transcriptome
6156                    spliceai_assembly = os.path.join(
6157                        config.get("folders", {})
6158                        .get("databases", {})
6159                        .get("spliceai", {}),
6160                        options.get("genome"),
6161                        "transcriptome",
6162                    )
6163                    spip_assembly = options.get("genome")
6164
6165                    spip = find(
6166                        f"transcriptome_{spip_assembly}.RData",
6167                        config.get("folders", {}).get("databases", {}).get("spip", {}),
6168                    )
6169                    spliceai = find("spliceai.refseq.txt", spliceai_assembly)
6170                    log.debug(f"SPiP annotations: {spip}")
6171                    log.debug(f"SpliceAI annotations: {spliceai}")
6172                    if spip and spliceai:
6173                        return [
6174                            f"--spip_transcriptome {spip}",
6175                            f"--spliceai_annotations {spliceai}",
6176                        ]
6177                    else:
6178                        # TODO crash and go on with basic annotations ?
6179                        # raise ValueError(
6180                        #     "Can't find splice databases in configuration EXIT"
6181                        # )
6182                        log.warning(
6183                            "Can't find splice databases in configuration, use annotations file from image"
6184                        )
6185                except TypeError:
6186                    log.warning(
6187                        "Can't find splice databases in configuration, use annotations file from image"
6188                    )
6189                    return []
6190
6191            # Add options, check if transcriptome option have already beend provided
6192            if (
6193                "spip_transcriptome" not in nf_params
6194                and "spliceai_transcriptome" not in nf_params
6195            ):
6196                splice_reference = splice_annotations(options, config)
6197                if splice_reference:
6198                    nf_params.extend(splice_reference)
6199
6200            nf_params.append(f"--output_folder {output_folder}")
6201
6202            random_uuid = f"HOWARD-SPLICE-{get_random()}"
6203            cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
6204            log.debug(cmd)
6205
6206            splice_config["docker"]["command"] = cmd
6207
6208            docker_cmd = get_bin_command(
6209                tool="splice",
6210                bin_type="docker",
6211                config=config,
6212                default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
6213                add_options=f"--name {random_uuid} {' '.join(mount)}",
6214            )
6215
6216            # Docker debug
6217            # if splice_config.get("rm_container"):
6218            #     rm_container = "--rm"
6219            # else:
6220            #     rm_container = ""
6221            # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
6222
6223            log.debug(docker_cmd)
6224            res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
6225            log.debug(res.stdout)
6226            if res.stderr:
6227                log.error(res.stderr)
6228            res.check_returncode()
6229        else:
6230            log.warning(f"Splice tool configuration not found: {config}")
6231
6232        # Update variants
6233        log.info("Annotation - Updating...")
6234        # Test find output vcf
6235        log.debug(
6236            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6237        )
6238        output_vcf = []
6239        # Wrong folder to look in
6240        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
6241            if (
6242                files
6243                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6244            ):
6245                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
6246        # log.debug(os.listdir(options.get("output_folder")))
6247        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
6248        if not output_vcf:
6249            log.debug(
6250                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
6251            )
6252        else:
6253            # Get new header from annotated vcf
6254            log.debug(f"Initial header: {len(header.infos)} fields")
6255            # Create new header with splice infos
6256            new_vcf = Variants(input=output_vcf[0])
6257            new_vcf_header = new_vcf.get_header().infos
6258            for keys, infos in new_vcf_header.items():
6259                if keys not in header.infos.keys():
6260                    header.infos[keys] = infos
6261            log.debug(f"New header: {len(header.infos)} fields")
6262            log.debug(f"Splice tmp output: {output_vcf[0]}")
6263            self.update_from_vcf(output_vcf[0])
6264
6265        # Remove folder
6266        remove_if_exists(output_folder)

This function annotates with splice tools (SpliceAI and SPiP; the docstring's mention of snpEff appears to be a copy-paste error).

Parameters:
  - threads: The number of threads to use.

Returns:
  the value of the variable "return_value".
6272    def get_config_default(self, name: str) -> dict:
6273        """
6274        The function `get_config_default` returns a dictionary containing default configurations for
6275        various calculations and prioritizations.
6276
6277        :param name: The `get_config_default` function returns a dictionary containing default
6278        configurations for different calculations and prioritizations. The `name` parameter is used to
6279        specify which specific configuration to retrieve from the dictionary
6280        :type name: str
6281        :return: The function `get_config_default` returns a dictionary containing default configuration
6282        settings for different calculations and prioritizations. The specific configuration settings are
6283        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
6284        matches a key in the `config_default` dictionary, the corresponding configuration settings are
6285        returned. If there is no match, an empty dictionary is returned.
6286        """
6287
6288        config_default = {
6289            "calculations": {
6290                "variant_chr_pos_alt_ref": {
6291                    "type": "sql",
6292                    "name": "variant_chr_pos_alt_ref",
6293                    "description": "Create a variant ID with chromosome, position, alt and ref",
6294                    "available": False,
6295                    "output_column_name": "variant_chr_pos_alt_ref",
6296                    "output_column_type": "String",
6297                    "output_column_description": "variant ID with chromosome, position, alt and ref",
6298                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
6299                    "operation_info": True,
6300                },
6301                "VARTYPE": {
6302                    "type": "sql",
6303                    "name": "VARTYPE",
6304                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
6305                    "available": True,
6306                    "output_column_name": "VARTYPE",
6307                    "output_column_type": "String",
6308                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
6309                    "operation_query": """
6310                            CASE
6311                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
6312                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
6313                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
6314                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
6315                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
6316                                ELSE 'UNDEFINED'
6317                            END
6318                            """,
6319                    "info_fields": ["SVTYPE"],
6320                    "operation_info": True,
6321                },
6322                "snpeff_hgvs": {
6323                    "type": "python",
6324                    "name": "snpeff_hgvs",
6325                    "description": "HGVS nomenclatures from snpEff annotation",
6326                    "available": True,
6327                    "function_name": "calculation_extract_snpeff_hgvs",
6328                    "function_params": ["snpeff_hgvs", "ANN"],
6329                },
6330                "snpeff_ann_explode": {
6331                    "type": "python",
6332                    "name": "snpeff_ann_explode",
6333                    "description": "Explode snpEff annotations with uniquify values",
6334                    "available": True,
6335                    "function_name": "calculation_snpeff_ann_explode",
6336                    "function_params": [False, "fields", "snpeff_", "ANN"],
6337                },
6338                "snpeff_ann_explode_uniquify": {
6339                    "type": "python",
6340                    "name": "snpeff_ann_explode_uniquify",
6341                    "description": "Explode snpEff annotations",
6342                    "available": True,
6343                    "function_name": "calculation_snpeff_ann_explode",
6344                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
6345                },
6346                "snpeff_ann_explode_json": {
6347                    "type": "python",
6348                    "name": "snpeff_ann_explode_json",
6349                    "description": "Explode snpEff annotations in JSON format",
6350                    "available": True,
6351                    "function_name": "calculation_snpeff_ann_explode",
6352                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
6353                },
6354                "NOMEN": {
6355                    "type": "python",
6356                    "name": "NOMEN",
6357                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field",
6358                    "available": True,
6359                    "function_name": "calculation_extract_nomen",
6360                    "function_params": [],
6361                },
6362                "FINDBYPIPELINE": {
6363                    "type": "python",
6364                    "name": "FINDBYPIPELINE",
6365                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
6366                    "available": True,
6367                    "function_name": "calculation_find_by_pipeline",
6368                    "function_params": ["findbypipeline"],
6369                },
6370                "FINDBYSAMPLE": {
6371                    "type": "python",
6372                    "name": "FINDBYSAMPLE",
6373                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
6374                    "available": True,
6375                    "function_name": "calculation_find_by_pipeline",
6376                    "function_params": ["findbysample"],
6377                },
6378                "GENOTYPECONCORDANCE": {
6379                    "type": "python",
6380                    "name": "GENOTYPECONCORDANCE",
6381                    "description": "Concordance of genotype for multi caller VCF",
6382                    "available": True,
6383                    "function_name": "calculation_genotype_concordance",
6384                    "function_params": [],
6385                },
6386                "BARCODE": {
6387                    "type": "python",
6388                    "name": "BARCODE",
6389                    "description": "BARCODE as VaRank tool",
6390                    "available": True,
6391                    "function_name": "calculation_barcode",
6392                    "function_params": [],
6393                },
6394                "BARCODEFAMILY": {
6395                    "type": "python",
6396                    "name": "BARCODEFAMILY",
6397                    "description": "BARCODEFAMILY as VaRank tool",
6398                    "available": True,
6399                    "function_name": "calculation_barcode_family",
6400                    "function_params": ["BCF"],
6401                },
6402                "TRIO": {
6403                    "type": "python",
6404                    "name": "TRIO",
6405                    "description": "Inheritance for a trio family",
6406                    "available": True,
6407                    "function_name": "calculation_trio",
6408                    "function_params": [],
6409                },
6410                "VAF": {
6411                    "type": "python",
6412                    "name": "VAF",
6413                    "description": "Variant Allele Frequency (VAF) harmonization",
6414                    "available": True,
6415                    "function_name": "calculation_vaf_normalization",
6416                    "function_params": [],
6417                },
6418                "VAF_stats": {
6419                    "type": "python",
6420                    "name": "VAF_stats",
6421                    "description": "Variant Allele Frequency (VAF) statistics",
6422                    "available": True,
6423                    "function_name": "calculation_genotype_stats",
6424                    "function_params": ["VAF"],
6425                },
6426                "DP_stats": {
6427                    "type": "python",
6428                    "name": "DP_stats",
6429                    "description": "Depth (DP) statistics",
6430                    "available": True,
6431                    "function_name": "calculation_genotype_stats",
6432                    "function_params": ["DP"],
6433                },
6434                "variant_id": {
6435                    "type": "python",
6436                    "name": "variant_id",
6437                    "description": "Variant ID generated from variant position and type",
6438                    "available": True,
6439                    "function_name": "calculation_variant_id",
6440                    "function_params": [],
6441                },
6442            },
6443            "prioritizations": {
6444                "default": {
6445                    "filter": [
6446                        {
6447                            "type": "notequals",
6448                            "value": "!PASS|\\.",
6449                            "score": 0,
6450                            "flag": "FILTERED",
6451                            "comment": ["Bad variant quality"],
6452                        },
6453                        {
6454                            "type": "equals",
6455                            "value": "REJECT",
6456                            "score": -20,
6457                            "flag": "PASS",
6458                            "comment": ["Bad variant quality"],
6459                        },
6460                    ],
6461                    "DP": [
6462                        {
6463                            "type": "gte",
6464                            "value": "50",
6465                            "score": 5,
6466                            "flag": "PASS",
6467                            "comment": ["DP higher than 50"],
6468                        }
6469                    ],
6470                    "ANN": [
6471                        {
6472                            "type": "contains",
6473                            "value": "HIGH",
6474                            "score": 5,
6475                            "flag": "PASS",
6476                            "comment": [
6477                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
6478                            ],
6479                        },
6480                        {
6481                            "type": "contains",
6482                            "value": "MODERATE",
6483                            "score": 3,
6484                            "flag": "PASS",
6485                            "comment": [
6486                                "A non-disruptive variant that might change protein effectiveness"
6487                            ],
6488                        },
6489                        {
6490                            "type": "contains",
6491                            "value": "LOW",
6492                            "score": 0,
6493                            "flag": "FILTERED",
6494                            "comment": [
6495                                "Assumed to be mostly harmless or unlikely to change protein behavior"
6496                            ],
6497                        },
6498                        {
6499                            "type": "contains",
6500                            "value": "MODIFIER",
6501                            "score": 0,
6502                            "flag": "FILTERED",
6503                            "comment": [
6504                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
6505                            ],
6506                        },
6507                    ],
6508                }
6509            },
6510        }
6511
6512        return config_default.get(name, None)

The function get_config_default returns a dictionary containing default configurations for various calculations and prioritizations.

Parameters
  • name: the key of the default configuration section to retrieve from the dictionary (e.g. "calculations" or "prioritizations")
Returns

The function get_config_default returns a dictionary containing default configuration settings for different calculations and prioritizations. The specific configuration settings are retrieved based on the input name parameter provided to the function. If the name parameter matches a key in the config_default dictionary, the corresponding configuration settings are returned. If there is no match, an empty dictionary is returned.

def get_config_json(self, name: str, config_dict: dict = {}, config_file: str = None) -> dict:
6514    def get_config_json(
6515        self, name: str, config_dict: dict = {}, config_file: str = None
6516    ) -> dict:
6517        """
6518        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
6519        default values, a dictionary, and a file.
6520
6521        :param name: The `name` parameter in the `get_config_json` function is a string that represents
6522        the name of the configuration. It is used to identify and retrieve the configuration settings
6523        for a specific component or module
6524        :type name: str
6525        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
6526        dictionary that allows you to provide additional configuration settings or overrides. When you
6527        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
6528        the key is the configuration setting you want to override or
6529        :type config_dict: dict
6530        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
6531        specify the path to a configuration file that contains additional settings. If provided, the
6532        function will read the contents of this file and update the configuration dictionary with the
6533        values found in the file, overriding any existing values with the
6534        :type config_file: str
6535        :return: The function `get_config_json` returns a dictionary containing the configuration
6536        settings.
6537        """
6538
6539        # Create with default prioritizations
6540        config_default = self.get_config_default(name=name)
6541        configuration = config_default
6542        # log.debug(f"configuration={configuration}")
6543
6544        # Replace prioritizations from dict
6545        for config in config_dict:
6546            configuration[config] = config_dict[config]
6547
6548        # Replace prioritizations from file
6549        config_file = full_path(config_file)
6550        if config_file:
6551            if os.path.exists(config_file):
6552                with open(config_file) as config_file_content:
6553                    config_file_dict = json.load(config_file_content)
6554                for config in config_file_dict:
6555                    configuration[config] = config_file_dict[config]
6556            else:
6557                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
6558                log.error(msg_error)
6559                raise ValueError(msg_error)
6560
6561        return configuration

The function get_config_json retrieves a configuration JSON object with prioritizations from default values, a dictionary, and a file.

Parameters
  • name: The name parameter in the get_config_json function is a string that represents the name of the configuration. It is used to identify and retrieve the configuration settings for a specific component or module
  • config_dict: a dictionary of additional configuration settings or overrides. Each key-value pair overrides or extends the corresponding top-level key of the default configuration
  • config_file: the path to a configuration file containing additional settings. If provided, the function reads this file and updates the configuration dictionary with the values found in it, overriding any existing values with those from the file
Returns

The function get_config_json returns a dictionary containing the configuration settings.

def prioritization(self) -> None:
6563    def prioritization(self) -> None:
6564        """
6565        It takes a VCF file, and adds a bunch of new INFO fields to it, based on the values of other
6566        INFO fields
6567        """
6568
6569        # Config
6570        config = self.get_config()
6571
6572        # Param
6573        param = self.get_param()
6574
6575        # Quick Prioritizations
6576        # prioritizations = param.get("prioritization", {}).get("prioritizations", "")
6577
6578        # Configuration profiles
6579        prioritization_config_file = param.get("prioritization", {}).get(
6580            "prioritization_config", None
6581        )
6582        prioritization_config_file = full_path(prioritization_config_file)
6583        prioritizations_config = self.get_config_json(
6584            name="prioritizations", config_file=prioritization_config_file
6585        )
6586
6587        # Prioritization options
6588        profiles = param.get("prioritization", {}).get("profiles", [])
6589        if isinstance(profiles, str):
6590            profiles = profiles.split(",")
6591        pzfields = param.get("prioritization", {}).get(
6592            "pzfields", ["PZFlag", "PZScore"]
6593        )
6594        if isinstance(pzfields, str):
6595            pzfields = pzfields.split(",")
6596        default_profile = param.get("prioritization", {}).get("default_profile", None)
6597        pzfields_sep = param.get("prioritization", {}).get("pzfields_sep", "_")
6598        prioritization_score_mode = param.get("prioritization", {}).get(
6599            "prioritization_score_mode", "HOWARD"
6600        )
6601
6602        # Quick Prioritizations
6603        # prioritizations = param.get("prioritization", {}).get("prioritizations", None)
6604        prioritizations = param.get("prioritizations", None)
6605        if prioritizations:
6606            log.info("Quick Prioritization:")
6607            for profile in prioritizations.split(","):
6608                if profile not in profiles:
6609                    profiles.append(profile)
6610                    log.info(f"   {profile}")
6611
6612        # If profile "ALL" provided, all profiles in the config profiles
6613        if "ALL" in profiles:
6614            profiles = list(prioritizations_config.keys())
6615
6616        for profile in profiles:
6617            if prioritizations_config.get(profile, None):
6618                log.debug(f"Profile '{profile}' configured")
6619            else:
6620                msg_error = f"Profile '{profile}' NOT configured"
6621                log.error(msg_error)
6622                raise ValueError(msg_error)
6623
6624        if profiles:
6625            log.info(f"Prioritization... ")
6626        else:
6627            log.debug(f"No profile defined")
6628            return
6629
6630        if not default_profile and len(profiles):
6631            default_profile = profiles[0]
6632
6633        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
6634        log.debug("Profiles to check: " + str(list(profiles)))
6635
6636        # Variables
6637        table_variants = self.get_table_variants(clause="update")
6638
6639        # Added columns
6640        added_columns = []
6641
6642        # Create list of PZfields
6643        # List of PZFields
6644        list_of_pzfields_original = pzfields + [
6645            pzfield + pzfields_sep + profile
6646            for pzfield in pzfields
6647            for profile in profiles
6648        ]
6649        list_of_pzfields = []
6650        log.debug(f"{list_of_pzfields_original}")
6651
6652        # Remove existing PZfields to use if exists
6653        for pzfield in list_of_pzfields_original:
6654            if self.get_header().infos.get(pzfield, None) is None:
6655                list_of_pzfields.append(pzfield)
6656                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
6657            else:
6658                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
6659
6660        if list_of_pzfields:
6661
6662            # Explode Infos fields
6663            explode_infos_prefix = self.get_explode_infos_prefix()
6664            added_columns += self.explode_infos(prefix=explode_infos_prefix)
6665            extra_infos = self.get_extra_infos()
6666
6667            # PZfields tags description
6668            PZfields_INFOS = {
6669                "PZTags": {
6670                    "ID": "PZTags",
6671                    "Number": ".",
6672                    "Type": "String",
6673                    "Description": "Variant tags based on annotation criteria",
6674                },
6675                "PZScore": {
6676                    "ID": "PZScore",
6677                    "Number": 1,
6678                    "Type": "Integer",
6679                    "Description": "Variant score based on annotation criteria",
6680                },
6681                "PZFlag": {
6682                    "ID": "PZFlag",
6683                    "Number": 1,
6684                    "Type": "String",
6685                    "Description": "Variant flag based on annotation criteria",
6686                },
6687                "PZComment": {
6688                    "ID": "PZComment",
6689                    "Number": ".",
6690                    "Type": "String",
6691                    "Description": "Variant comment based on annotation criteria",
6692                },
6693                "PZInfos": {
6694                    "ID": "PZInfos",
6695                    "Number": ".",
6696                    "Type": "String",
6697                    "Description": "Variant infos based on annotation criteria",
6698                },
6699            }
6700
6701            # Create INFO fields if not exist
6702            for field in PZfields_INFOS:
6703                field_ID = PZfields_INFOS[field]["ID"]
6704                field_description = PZfields_INFOS[field]["Description"]
6705                if field_ID not in self.get_header().infos and field_ID in pzfields:
6706                    field_description = (
6707                        PZfields_INFOS[field]["Description"]
6708                        + f", profile {default_profile}"
6709                    )
6710                    self.get_header().infos[field_ID] = vcf.parser._Info(
6711                        field_ID,
6712                        PZfields_INFOS[field]["Number"],
6713                        PZfields_INFOS[field]["Type"],
6714                        field_description,
6715                        "unknown",
6716                        "unknown",
6717                        code_type_map[PZfields_INFOS[field]["Type"]],
6718                    )
6719
6720            # Create INFO fields if not exist for each profile
6721            for profile in prioritizations_config:
6722                if profile in profiles or profiles == []:
6723                    for field in PZfields_INFOS:
6724                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
6725                        field_description = (
6726                            PZfields_INFOS[field]["Description"]
6727                            + f", profile {profile}"
6728                        )
6729                        if (
6730                            field_ID not in self.get_header().infos
6731                            and field in pzfields
6732                        ):
6733                            self.get_header().infos[field_ID] = vcf.parser._Info(
6734                                field_ID,
6735                                PZfields_INFOS[field]["Number"],
6736                                PZfields_INFOS[field]["Type"],
6737                                field_description,
6738                                "unknown",
6739                                "unknown",
6740                                code_type_map[PZfields_INFOS[field]["Type"]],
6741                            )
6742
6743            # Header
6744            for pzfield in list_of_pzfields:
6745                if re.match("PZScore.*", pzfield):
6746                    added_column = self.add_column(
6747                        table_name=table_variants,
6748                        column_name=pzfield,
6749                        column_type="INTEGER",
6750                        default_value="0",
6751                    )
6752                elif re.match("PZFlag.*", pzfield):
6753                    added_column = self.add_column(
6754                        table_name=table_variants,
6755                        column_name=pzfield,
6756                        column_type="BOOLEAN",
6757                        default_value="1",
6758                    )
6759                else:
6760                    added_column = self.add_column(
6761                        table_name=table_variants,
6762                        column_name=pzfield,
6763                        column_type="STRING",
6764                        default_value="''",
6765                    )
6766                added_columns.append(added_column)
6767
6768            # Profiles
6769            if profiles:
6770
6771                # foreach profile in configuration file
6772                for profile in prioritizations_config:
6773
6774                    # If profile is asked in param, or ALL are asked (empty profile [])
6775                    if profile in profiles or profiles == []:
6776                        log.info(f"Profile '{profile}'")
6777
6778                        sql_set_info_option = ""
6779
6780                        sql_set_info = []
6781
6782                        # PZ fields set
6783
6784                        # PZScore
6785                        if f"PZScore{pzfields_sep}{profile}" in list_of_pzfields:
6786                            sql_set_info.append(
6787                                f"""
6788                                    concat(
6789                                        'PZScore{pzfields_sep}{profile}=',
6790                                        PZScore{pzfields_sep}{profile}
6791                                    ) 
6792                                """
6793                            )
6794                            if (
6795                                profile == default_profile
6796                                and "PZScore" in list_of_pzfields
6797                            ):
6798                                sql_set_info.append(
6799                                    f"""
6800                                        concat(
6801                                            'PZScore=',
6802                                            PZScore{pzfields_sep}{profile}
6803                                        )
6804                                    """
6805                                )
6806
6807                        # PZFlag
6808                        if f"PZFlag{pzfields_sep}{profile}" in list_of_pzfields:
6809                            sql_set_info.append(
6810                                f"""
6811                                    concat(
6812                                        'PZFlag{pzfields_sep}{profile}=',
6813                                        CASE 
6814                                            WHEN PZFlag{pzfields_sep}{profile}==1
6815                                            THEN 'PASS'
6816                                            WHEN PZFlag{pzfields_sep}{profile}==0
6817                                            THEN 'FILTERED'
6818                                        END
6819                                    ) 
6820                                """
6821                            )
6822                            if (
6823                                profile == default_profile
6824                                and "PZFlag" in list_of_pzfields
6825                            ):
6826                                sql_set_info.append(
6827                                    f"""
6828                                        concat(
6829                                            'PZFlag=',
6830                                            CASE 
6831                                                WHEN PZFlag{pzfields_sep}{profile}==1
6832                                                THEN 'PASS'
6833                                                WHEN PZFlag{pzfields_sep}{profile}==0
6834                                                THEN 'FILTERED'
6835                                            END
6836                                        )
6837                                    """
6838                                )
6839
6840                        # PZComment
6841                        if f"PZComment{pzfields_sep}{profile}" in list_of_pzfields:
6842                            sql_set_info.append(
6843                                f"""
6844                                    CASE
6845                                        WHEN PZComment{pzfields_sep}{profile} NOT IN ('')
6846                                        THEN concat('PZComment{pzfields_sep}{profile}=', PZComment{pzfields_sep}{profile})
6847                                        ELSE ''
6848                                    END
6849                                """
6850                            )
6851                            if (
6852                                profile == default_profile
6853                                and "PZComment" in list_of_pzfields
6854                            ):
6855                                sql_set_info.append(
6856                                    f"""
6857                                        CASE
6858                                            WHEN PZComment{pzfields_sep}{profile} NOT IN ('')
6859                                            THEN concat('PZComment=', PZComment{pzfields_sep}{profile})
6860                                            ELSE ''
6861                                        END
6862                                    """
6863                                )
6864
6865                        # PZInfos
6866                        if f"PZInfos{pzfields_sep}{profile}" in list_of_pzfields:
6867                            sql_set_info.append(
6868                                f"""
6869                                    CASE
6870                                        WHEN PZInfos{pzfields_sep}{profile} NOT IN ('')
6871                                        THEN concat('PZInfos{pzfields_sep}{profile}=', PZInfos{pzfields_sep}{profile})
6872                                        ELSE ''
6873                                    END
6874                                """
6875                            )
6876                            if (
6877                                profile == default_profile
6878                                and "PZInfos" in list_of_pzfields
6879                            ):
6880                                sql_set_info.append(
6881                                    f"""
6882                                        CASE
6883                                            WHEN PZInfos{pzfields_sep}{profile} NOT IN ('')
6884                                            THEN concat('PZInfos=', PZInfos{pzfields_sep}{profile})
6885                                            ELSE ''
6886                                        END
6887                                    """
6888                                )
6889
6890                        # Merge PZfields
6891                        sql_set_info_option = ""
6892                        sql_set_sep = ""
6893                        for sql_set in sql_set_info:
6894                            if sql_set_sep:
6895                                sql_set_info_option += f"""
6896                                    , concat('{sql_set_sep}', {sql_set})
6897                                """
6898                            else:
6899                                sql_set_info_option += f"""
6900                                    , {sql_set}
6901                                """
6902                            sql_set_sep = ";"
6903
6904                        sql_queries = []
6905                        for annotation in prioritizations_config[profile]:
6906
6907                            # Check if annotation field is present
6908                            if not f"{explode_infos_prefix}{annotation}" in extra_infos:
6909                                log.debug(f"Annotation '{annotation}' not in data")
6910                                continue
6911                            else:
6912                                log.debug(f"Annotation '{annotation}' in data")
6913
6914                            # For each criterions
6915                            for criterion in prioritizations_config[profile][
6916                                annotation
6917                            ]:
6918                                criterion_type = criterion["type"]
6919                                criterion_value = criterion["value"]
6920                                criterion_score = criterion.get("score", 0)
6921                                criterion_flag = criterion.get("flag", "PASS")
6922                                criterion_flag_bool = criterion_flag == "PASS"
6923                                criterion_comment = (
6924                                    ", ".join(criterion.get("comment", []))
6925                                    .replace("'", "''")
6926                                    .replace(";", ",")
6927                                    .replace("\t", " ")
6928                                )
6929                                criterion_infos = (
6930                                    str(criterion)
6931                                    .replace("'", "''")
6932                                    .replace(";", ",")
6933                                    .replace("\t", " ")
6934                                )
6935
6936                                sql_set = []
6937                                sql_set_info = []
6938
6939                                # PZ fields set
6940                                if (
6941                                    f"PZScore{pzfields_sep}{profile}"
6942                                    in list_of_pzfields
6943                                ):
6944                                    if prioritization_score_mode == "HOWARD":
6945                                        sql_set.append(
6946                                            f"PZScore{pzfields_sep}{profile} = PZScore{pzfields_sep}{profile} + {criterion_score}"
6947                                        )
6948                                    elif prioritization_score_mode == "VaRank":
6949                                        sql_set.append(
6950                                            f"PZScore{pzfields_sep}{profile} = CASE WHEN {criterion_score}>PZScore{pzfields_sep}{profile} THEN {criterion_score} END"
6951                                        )
6952                                    else:
6953                                        sql_set.append(
6954                                            f"PZScore{pzfields_sep}{profile} = PZScore{pzfields_sep}{profile} + {criterion_score}"
6955                                        )
6956                                if f"PZFlag{pzfields_sep}{profile}" in list_of_pzfields:
6957                                    sql_set.append(
6958                                        f"PZFlag{pzfields_sep}{profile} = PZFlag{pzfields_sep}{profile} AND {criterion_flag_bool}"
6959                                    )
6960                                if (
6961                                    f"PZComment{pzfields_sep}{profile}"
6962                                    in list_of_pzfields
6963                                ):
6964                                    sql_set.append(
6965                                        f"""
6966                                            PZComment{pzfields_sep}{profile} = 
6967                                                concat(
6968                                                    PZComment{pzfields_sep}{profile},
6969                                                    CASE 
6970                                                        WHEN PZComment{pzfields_sep}{profile}!=''
6971                                                        THEN ', '
6972                                                        ELSE ''
6973                                                    END,
6974                                                    '{criterion_comment}'
6975                                                )
6976                                        """
6977                                    )
6978                                if (
6979                                    f"PZInfos{pzfields_sep}{profile}"
6980                                    in list_of_pzfields
6981                                ):
6982                                    sql_set.append(
6983                                        f"""
6984                                            PZInfos{pzfields_sep}{profile} = 
6985                                                concat(
6986                                                    PZInfos{pzfields_sep}{profile},
6987                                                    '{criterion_infos}'
6988                                                )
6989                                        """
6990                                    )
6991                                sql_set_option = ",".join(sql_set)
6992
6993                                # Criterion and comparison
6994                                try:
6995                                    float(criterion_value)
6996                                    sql_update = f"""
6997                                        UPDATE {table_variants}
6998                                        SET {sql_set_option}
6999                                        WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
7000                                        AND "{explode_infos_prefix}{annotation}"{comparison_map[criterion_type]}{criterion_value}
7001                                        """
7002                                except:
7003                                    contains_option = ""
7004                                    if criterion_type == "contains":
7005                                        contains_option = ".*"
7006                                    sql_update = f"""
7007                                        UPDATE {table_variants}
7008                                        SET {sql_set_option}
7009                                        WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
7010                                        """
7011                                sql_queries.append(sql_update)
7012
7013                        # PZTags
7014                        if f"PZTags{pzfields_sep}{profile}" in list_of_pzfields:
7015
7016                            # Create PZFalgs value
7017                            pztags_value = ""
7018                            pztags_sep_default = "|"
7019                            pztags_sep = ""
7020                            for pzfield in pzfields:
7021                                if pzfield not in ["PZTags"]:
7022                                    if (
7023                                        f"{pzfield}{pzfields_sep}{profile}"
7024                                        in list_of_pzfields
7025                                    ):
7026                                        if pzfield in ["PZFlag"]:
7027                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
7028                                                CASE WHEN PZFlag{pzfields_sep}{profile}
7029                                                    THEN 'PASS'
7030                                                    ELSE 'FILTERED'
7031                                                END, '"""
7032                                        else:
7033                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
7034                                        pztags_sep = pztags_sep_default
7035
7036                            # Add Query update for PZFlags
7037                            sql_update_pztags = f"""
7038                                UPDATE {table_variants}
7039                                SET INFO = concat(
7040                                        INFO,
7041                                        CASE WHEN INFO NOT in ('','.')
7042                                                THEN ';'
7043                                                ELSE ''
7044                                        END,
7045                                        'PZTags{pzfields_sep}{profile}={pztags_value}'
7046                                    )
7047                                """
7048                            sql_queries.append(sql_update_pztags)
7049
7050                            # Add Query update for PZFlags for default
7051                            if profile == default_profile:
7052                                sql_update_pztags_default = f"""
7053                                UPDATE {table_variants}
7054                                SET INFO = concat(
7055                                        INFO,
7056                                        ';',
7057                                        'PZTags={pztags_value}'
7058                                    )
7059                                """
7060                                sql_queries.append(sql_update_pztags_default)
7061
7062                        log.info(f"""Profile '{profile}' - Prioritization... """)
7063
7064                        if sql_queries:
7065
7066                            for sql_query in sql_queries:
7067                                log.debug(
7068                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
7069                                )
7070                                self.conn.execute(sql_query)
7071
7072                        log.info(f"""Profile '{profile}' - Update... """)
7073                        sql_query_update = f"""
7074                            UPDATE {table_variants}
7075                            SET INFO =  
7076                                concat(
7077                                    CASE
7078                                        WHEN INFO NOT IN ('','.')
7079                                        THEN concat(INFO, ';')
7080                                        ELSE ''
7081                                    END
7082                                    {sql_set_info_option}
7083                                )
7084                        """
7085                        self.conn.execute(sql_query_update)
7086
7087        else:
7088
7089            log.warning(f"No profiles in parameters")
7090
7091        # Remove added columns
7092        for added_column in added_columns:
7093            self.drop_column(column=added_column)
7094
7095        # Explode INFOS fields into table fields
7096        if self.get_explode_infos():
7097            self.explode_infos(
7098                prefix=self.get_explode_infos_prefix(),
7099                fields=self.get_explode_infos_fields(),
7100                force=True,
7101            )
7102
7103        return

It takes a VCF file and adds new INFO fields to it (prioritization scores, flags, comments, and tags), based on the values of other INFO fields.

def annotation_hgvs(self, threads: int = None) -> None:
    def annotation_hgvs(self, threads: int = None) -> None:
        """
        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
        coordinates and alleles.

        Workflow: resolve genome / refSeq / refSeqLink resources from config and param,
        select SNV/InDel variants, compute one (or two, with `add_protein`) HGVS names per
        overlapping transcript in parallel (one Dask partition per thread), store the result
        in a temporary random-named column, merge it into the INFO field as an 'hgvs=' entry,
        register the 'hgvs' INFO field in the VCF header, then drop the temporary column.

        :param threads: The `threads` parameter is an optional integer that specifies the number of
        threads to use for parallel processing. If no value is provided, it will default to the number
        of threads obtained from the `get_threads()` method
        :type threads: int
        """

        # Function for each partition of the Dask Dataframe
        def partition_function(partition):
            """
            Apply `annotation_hgvs_partition` to each row of a DataFrame partition.

            :param partition: a pandas DataFrame chunk handled by one Dask partition
            :return: a pandas Series with one HGVS annotation string per row
            """
            return partition.apply(annotation_hgvs_partition, axis=1)

        def annotation_hgvs_partition(row) -> str:
            """
            Build the comma-separated list of HGVS names for one variant row.

            Closes over `polars_conn`, `transcripts`, `genome` and the HGVS option
            flags defined in the enclosing method.

            :param row: a dict-like object providing "CHROM", "POS", "REF" and "ALT"
            :return: a comma-separated string of HGVS names (empty string when no
                transcript overlaps the position)
            """

            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find list of associated transcripts
            # (refseq_df is resolved by the Polars SQLContext via register_globals)
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                SELECT transcript
                FROM refseq_df
                WHERE CHROM='{chr}'
                AND POS={pos}
            """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein
                # NOTE(review): this branch queries refseqlink_df, which is only created
                # below when refseqlink_file is found — if a protein option is set without
                # a refSeqLink database, this lookup would fail; confirm upstream guards
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                        SELECT protein
                        FROM refseqlink_df
                        WHERE transcript='{transcript_name}'
                        LIMIT 1
                    """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # With add_protein, also emit a second, protein-level name alongside
                # the default one (skipped when use_protein/full_format already cover it)
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Create list of HGVS annotations
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full

        # Polars connection (eager SQLContext resolving registered global dataframes)
        # NOTE(review): re-created later before the Dask run; this first instance
        # appears redundant
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        # Config
        config = self.get_config()

        # Databases
        # Genome
        databases_genomes_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER)
        )
        # Same lookup as above but defaulting to "" — used as an explicit genome path first
        databases_genome = (
            config.get("folders", {}).get("databases", {}).get("genomes", "")
        )
        # refseq database folder
        databases_refseq_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("refseq", DEFAULT_REFSEQ_FOLDER)
        )
        # refSeq
        databases_refseq = config.get("databases", {}).get("refSeq", None)
        # refSeqLink
        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

        # Param
        param = self.get_param()

        # Quick HGVS: parse comma-separated "opt[=value]" options into param["hgvs"],
        # coercing "true"/"false" (any case) to booleans, bare options to True
        if "hgvs_options" in param and param.get("hgvs_options", ""):
            log.info(f"Quick HGVS Annotation:")
            if not param.get("hgvs", None):
                param["hgvs"] = {}
            for option in param.get("hgvs_options", "").split(","):
                option_var_val = option.split("=")
                option_var = option_var_val[0]
                if len(option_var_val) > 1:
                    option_val = option_var_val[1]
                else:
                    option_val = "True"
                if option_val.upper() in ["TRUE"]:
                    option_val = True
                elif option_val.upper() in ["FALSE"]:
                    option_val = False
                log.info(f"   {option_var}={option_val}")
                param["hgvs"][option_var] = option_val

        # Check if HGVS annotation enabled; bail out silently otherwise
        if "hgvs" in param:
            log.info(f"HGVS Annotation... ")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            return

        # HGVS Param
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        codon_type = param_hgvs.get("codon_type", "3")

        # refSeq / refSeqLink: param overrides config
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome: explicit path first, then lookup by folder + assembly
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSeq
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only (REF and ALT strictly alphabetic,
        # i.e. excluding '.', '*', breakends and symbolic alleles)
        query_variants = f"""
            SELECT "#CHROM" AS CHROM, POS, REF, ALT
            FROM {table_variants}
            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
            """
        df_variants = self.get_query_to_df(query_variants)

        # Added columns (tracked for cleanup at the end)
        added_columns = []

        # Add hgvs column in variants table (random suffix to avoid name clashes)
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug(f"refSeq loading...")
        # refSeq in duckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Loading refSeq transcripts overlapping the variants into a Polars Dataframe
        refseq_query = f"""
            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
            FROM {refseq_table}
            JOIN df_variants ON (
                {refseq_table}.chrom = df_variants.CHROM
                AND {refseq_table}.txStart<=df_variants.POS
                AND {refseq_table}.txEnd>=df_variants.POS
            )
        """
        refseq_df = self.conn.query(refseq_query).pl()

        if refseqlink_file:
            log.debug(f"refSeqLink loading...")
            # refSeqLink in duckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Loading transcript->protein accession mapping (versioned accessions)
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
                FROM {refseqlink_table} 
                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
                WHERE protAcc_without_ver IS NOT NULL
            """
            # Polars Dataframe
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read RefSeq transcripts into a python dict/model.
        # Exported to TSV via DuckDB COPY, then parsed with read_transcripts
        log.debug(f"Transcripts loading...")
        with tempfile.TemporaryDirectory() as tmpdir:
            transcripts_query = f"""
                COPY (
                    SELECT {refseq_table}.*
                    FROM {refseq_table}
                    JOIN df_variants ON (
                        {refseq_table}.chrom=df_variants.CHROM
                        AND {refseq_table}.txStart<=df_variants.POS
                        AND {refseq_table}.txEnd>=df_variants.POS
                    )
                )
                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
            """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connection — re-registered so refseq_df/refseqlink_df are visible
        # to the SQL queries executed inside the worker closures
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create
        # a Dask Dataframe from Pandas dataframe with partition as number of threads
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Use dask.dataframe.apply() to apply function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame to Pandas Dataframe (triggers the actual computation)
        df = ddf.compute()

        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column by joining on the full variant key
            # NOTE(review): the WHERE clause hard-codes the alias "variants" —
            # assumes get_table_variants(clause="update") returns that name; confirm
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
                """
            self.execute_query(update_variant_query)

        # Update INFO column: append ';' separator when INFO is non-empty, then 'hgvs=<value>'
        sql_query_update = f"""
            UPDATE {table_variants}
            SET INFO = 
                concat(
                    CASE 
                        WHEN INFO NOT IN ('','.')
                        THEN concat(INFO, ';')
                        ELSE ''
                    END,
                    'hgvs=',
                    {hgvs_column_name}
                )
            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
            """
        self.execute_query(sql_query_update)

        # Add 'hgvs' INFO field definition to the VCF header
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added columns (the temporary hgvs working column)
        for added_column in added_columns:
            self.drop_column(column=added_column)

The annotation_hgvs function performs HGVS annotation on a set of variants using genomic coordinates and alleles.

Parameters
  • threads: an optional integer specifying the number of threads to use for parallel processing. If no value is provided, it defaults to the number of threads obtained from the get_threads() method.
def get_operations_help( self, operations_config_dict: dict = {}, operations_config_file: str = None) -> list:
7496    def get_operations_help(
7497        self, operations_config_dict: dict = {}, operations_config_file: str = None
7498    ) -> list:
7499
7500        # Init
7501        operations_help = []
7502
7503        # operations
7504        operations = self.get_config_json(
7505            name="calculations",
7506            config_dict=operations_config_dict,
7507            config_file=operations_config_file,
7508        )
7509        for op in operations:
7510            op_name = operations[op].get("name", op).upper()
7511            op_description = operations[op].get("description", op_name)
7512            op_available = operations[op].get("available", False)
7513            if op_available:
7514                operations_help.append(f"   {op_name}: {op_description}")
7515
7516        # Sort operations
7517        operations_help.sort()
7518
7519        # insert header
7520        operations_help.insert(0, "Available calculation operations:")
7521
7522        # Return
7523        return operations_help
def calculation( self, operations: dict = {}, operations_config_dict: dict = {}, operations_config_file: str = None) -> None:
7525    def calculation(
7526        self,
7527        operations: dict = {},
7528        operations_config_dict: dict = {},
7529        operations_config_file: str = None,
7530    ) -> None:
7531        """
7532        It takes a list of operations, and for each operation, it checks if it's a python or sql
7533        operation, and then calls the appropriate function
7534
7535        param json example:
7536            "calculation": {
7537                "NOMEN": {
7538                    "options": {
7539                        "hgvs_field": "hgvs"
7540                    },
7541                "middle" : null
7542            }
7543        """
7544
7545        # Param
7546        param = self.get_param()
7547
7548        # operations config
7549        operations_config = self.get_config_json(
7550            name="calculations",
7551            config_dict=operations_config_dict,
7552            config_file=operations_config_file,
7553        )
7554
7555        # Upper keys
7556        operations_config = {k.upper(): v for k, v in operations_config.items()}
7557
7558        # Calculations
7559
7560        # Operations from param
7561        operations = param.get("calculation", {}).get("calculations", operations)
7562
7563        # Quick calculation - add
7564        if param.get("calculations", None):
7565            calculations_list = [
7566                value for value in param.get("calculations", "").split(",")
7567            ]
7568            log.info(f"Quick Calculations:")
7569            for calculation_key in calculations_list:
7570                log.info(f"   {calculation_key}")
7571            for calculation_operation in calculations_list:
7572                if calculation_operation.upper() not in operations:
7573                    operations[calculation_operation.upper()] = {}
7574                    add_value_into_dict(
7575                        dict_tree=param,
7576                        sections=[
7577                            "calculation",
7578                            "calculations",
7579                            calculation_operation.upper(),
7580                        ],
7581                        value={},
7582                    )
7583
7584        # Operations for calculation
7585        if not operations:
7586            operations = param.get("calculation", {}).get("calculations", {})
7587
7588        if operations:
7589            log.info(f"Calculations...")
7590
7591        # For each operations
7592        for operation_name in operations:
7593            operation_name = operation_name.upper()
7594            if operation_name not in [""]:
7595                if operation_name in operations_config:
7596                    log.info(f"Calculation '{operation_name}'")
7597                    operation = operations_config[operation_name]
7598                    operation_type = operation.get("type", "sql")
7599                    if operation_type == "python":
7600                        self.calculation_process_function(
7601                            operation=operation, operation_name=operation_name
7602                        )
7603                    elif operation_type == "sql":
7604                        self.calculation_process_sql(
7605                            operation=operation, operation_name=operation_name
7606                        )
7607                    else:
7608                        log.error(
7609                            f"Operations config: Type '{operation_type}' NOT available"
7610                        )
7611                        raise ValueError(
7612                            f"Operations config: Type '{operation_type}' NOT available"
7613                        )
7614                else:
7615                    log.error(
7616                        f"Operations config: Calculation '{operation_name}' NOT available"
7617                    )
7618                    raise ValueError(
7619                        f"Operations config: Calculation '{operation_name}' NOT available"
7620                    )
7621
7622        # Explode INFOS fields into table fields
7623        if self.get_explode_infos():
7624            self.explode_infos(
7625                prefix=self.get_explode_infos_prefix(),
7626                fields=self.get_explode_infos_fields(),
7627                force=True,
7628            )

It takes a list of operations, and for each operation, it checks if it's a python or sql operation, and then calls the appropriate function

param json example: "calculation": { "NOMEN": { "options": { "hgvs_field": "hgvs" }, "middle": null } }

def calculation_process_sql(self, operation: dict, operation_name: str = 'unknown') -> None:
7630    def calculation_process_sql(
7631        self, operation: dict, operation_name: str = "unknown"
7632    ) -> None:
7633        """
7634        The `calculation_process_sql` function takes in a mathematical operation as a string and
7635        performs the operation, updating the specified table with the result.
7636
7637        :param operation: The `operation` parameter is a dictionary that contains information about the
7638        mathematical operation to be performed. It includes the following keys:
7639        :type operation: dict
7640        :param operation_name: The `operation_name` parameter is a string that represents the name of
7641        the mathematical operation being performed. It is used for logging and error handling purposes,
7642        defaults to unknown
7643        :type operation_name: str (optional)
7644        """
7645
7646        # table variants
7647        table_variants = self.get_table_variants(clause="alter")
7648
7649        # Operation infos
7650        operation_name = operation.get("name", "unknown")
7651        log.debug(f"process sql {operation_name}")
7652        output_column_name = operation.get("output_column_name", operation_name)
7653        output_column_type = operation.get("output_column_type", "String")
7654        prefix = operation.get("explode_infos_prefix", "")
7655        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
7656        output_column_description = operation.get(
7657            "output_column_description", f"{operation_name} operation"
7658        )
7659        operation_query = operation.get("operation_query", None)
7660        if isinstance(operation_query, list):
7661            operation_query = " ".join(operation_query)
7662        operation_info_fields = operation.get("info_fields", [])
7663        operation_info_fields_check = operation.get("info_fields_check", False)
7664        operation_info = operation.get("operation_info", True)
7665
7666        if operation_query:
7667
7668            # Info fields check
7669            operation_info_fields_check_result = True
7670            if operation_info_fields_check:
7671                header_infos = self.get_header().infos
7672                for info_field in operation_info_fields:
7673                    operation_info_fields_check_result = (
7674                        operation_info_fields_check_result
7675                        and info_field in header_infos
7676                    )
7677
7678            # If info fields available
7679            if operation_info_fields_check_result:
7680
7681                # Added_columns
7682                added_columns = []
7683
7684                # Create VCF header field
7685                vcf_reader = self.get_header()
7686                vcf_reader.infos[output_column_name] = vcf.parser._Info(
7687                    output_column_name,
7688                    ".",
7689                    output_column_type,
7690                    output_column_description,
7691                    "howard calculation",
7692                    "0",
7693                    self.code_type_map.get(output_column_type),
7694                )
7695
7696                # Explode infos if needed
7697                log.debug(f"calculation_process_sql prefix {prefix}")
7698                added_columns += self.explode_infos(
7699                    prefix=prefix,
7700                    fields=[output_column_name] + operation_info_fields,
7701                    force=True,
7702                )
7703
7704                # Create column
7705                added_column = self.add_column(
7706                    table_name=table_variants,
7707                    column_name=prefix + output_column_name,
7708                    column_type=output_column_type_sql,
7709                    default_value="null",
7710                )
7711                added_columns.append(added_column)
7712
7713                # Operation calculation
7714                try:
7715
7716                    # Query to update calculation column
7717                    sql_update = f"""
7718                        UPDATE {table_variants}
7719                        SET "{prefix}{output_column_name}" = ({operation_query})
7720                    """
7721                    self.conn.execute(sql_update)
7722
7723                    # Add to INFO
7724                    if operation_info:
7725                        sql_update_info = f"""
7726                            UPDATE {table_variants}
7727                            SET "INFO" =
7728                                concat(
7729                                    CASE
7730                                        WHEN "INFO" IS NOT NULL
7731                                        THEN concat("INFO", ';')
7732                                        ELSE ''
7733                                    END,
7734                                    '{output_column_name}=',
7735                                    "{prefix}{output_column_name}"
7736                                )
7737                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
7738                        """
7739                        self.conn.execute(sql_update_info)
7740
7741                except:
7742                    log.error(
7743                        f"Operations config: Calculation '{operation_name}' query failed"
7744                    )
7745                    raise ValueError(
7746                        f"Operations config: Calculation '{operation_name}' query failed"
7747                    )
7748
7749                # Remove added columns
7750                for added_column in added_columns:
7751                    log.debug(f"added_column: {added_column}")
7752                    self.drop_column(column=added_column)
7753
7754            else:
7755                log.error(
7756                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
7757                )
7758                raise ValueError(
7759                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
7760                )
7761
7762        else:
7763            log.error(
7764                f"Operations config: Calculation '{operation_name}' query NOT defined"
7765            )
7766            raise ValueError(
7767                f"Operations config: Calculation '{operation_name}' query NOT defined"
7768            )

The calculation_process_sql function takes in a mathematical operation as a string and performs the operation, updating the specified table with the result.

Parameters
  • operation: The operation parameter is a dictionary that contains information about the mathematical operation to be performed. It includes the following keys:
  • operation_name: The operation_name parameter is a string that represents the name of the mathematical operation being performed. It is used for logging and error handling purposes, defaults to unknown
def calculation_process_function(self, operation: dict, operation_name: str = 'unknown') -> None:
7770    def calculation_process_function(
7771        self, operation: dict, operation_name: str = "unknown"
7772    ) -> None:
7773        """
7774        The `calculation_process_function` takes in an operation dictionary and performs the specified
7775        function with the given parameters.
7776
7777        :param operation: The `operation` parameter is a dictionary that contains information about the
7778        operation to be performed. It has the following keys:
7779        :type operation: dict
7780        :param operation_name: The `operation_name` parameter is a string that represents the name of
7781        the operation being performed. It is used for logging purposes, defaults to unknown
7782        :type operation_name: str (optional)
7783        """
7784
7785        operation_name = operation["name"]
7786        log.debug(f"process sql {operation_name}")
7787        function_name = operation["function_name"]
7788        function_params = operation["function_params"]
7789        getattr(self, function_name)(*function_params)

The calculation_process_function takes in an operation dictionary and performs the specified function with the given parameters.

Parameters
  • operation: The operation parameter is a dictionary that contains information about the operation to be performed. It has the following keys:
  • operation_name: The operation_name parameter is a string that represents the name of the operation being performed. It is used for logging purposes, defaults to unknown
def calculation_variant_id(self) -> None:
7791    def calculation_variant_id(self) -> None:
7792        """
7793        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
7794        updates the INFO field of a variants table with the variant ID.
7795        """
7796
7797        # variant_id annotation field
7798        variant_id_tag = self.get_variant_id_column()
7799        added_columns = [variant_id_tag]
7800
7801        # variant_id hgvs tags"
7802        vcf_infos_tags = {
7803            variant_id_tag: "howard variant ID annotation",
7804        }
7805
7806        # Variants table
7807        table_variants = self.get_table_variants()
7808
7809        # Header
7810        vcf_reader = self.get_header()
7811
7812        # Add variant_id to header
7813        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
7814            variant_id_tag,
7815            ".",
7816            "String",
7817            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
7818            "howard calculation",
7819            "0",
7820            self.code_type_map.get("String"),
7821        )
7822
7823        # Update
7824        sql_update = f"""
7825            UPDATE {table_variants}
7826            SET "INFO" = 
7827                concat(
7828                    CASE
7829                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
7830                        THEN ''
7831                        ELSE concat("INFO", ';')
7832                    END,
7833                    '{variant_id_tag}=',
7834                    "{variant_id_tag}"
7835                )
7836        """
7837        self.conn.execute(sql_update)
7838
7839        # Remove added columns
7840        for added_column in added_columns:
7841            self.drop_column(column=added_column)

The function calculation_variant_id adds a variant ID annotation to a VCF file header and updates the INFO field of a variants table with the variant ID.

def calculation_extract_snpeff_hgvs( self, snpeff_hgvs: str = 'snpeff_hgvs', snpeff_field: str = 'ANN') -> None:
7843    def calculation_extract_snpeff_hgvs(
7844        self,
7845        snpeff_hgvs: str = "snpeff_hgvs",
7846        snpeff_field: str = "ANN",
7847    ) -> None:
7848        """
7849        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
7850        annotation field in a VCF file and adds them as a new column in the variants table.
7851
7852        :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs`
7853        function is used to specify the name of the column that will store the HGVS nomenclatures
7854        extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to
7855        snpeff_hgvs
7856        :type snpeff_hgvs: str (optional)
7857        :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs`
7858        function represents the field in the VCF file that contains SnpEff annotations. This field is
7859        used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults
7860        to ANN
7861        :type snpeff_field: str (optional)
7862        """
7863
7864        # Snpeff hgvs tags
7865        vcf_infos_tags = {
7866            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
7867        }
7868
7869        # Prefix
7870        prefix = self.get_explode_infos_prefix()
7871        if prefix:
7872            prefix = "INFO/"
7873
7874        # snpEff fields
7875        speff_ann_infos = prefix + snpeff_field
7876        speff_hgvs_infos = prefix + snpeff_hgvs
7877
7878        # Variants table
7879        table_variants = self.get_table_variants()
7880
7881        # Header
7882        vcf_reader = self.get_header()
7883
7884        # Add columns
7885        added_columns = []
7886
7887        # Explode HGVS field in column
7888        added_columns += self.explode_infos(fields=[snpeff_field])
7889
7890        if snpeff_field in vcf_reader.infos:
7891
7892            log.debug(vcf_reader.infos[snpeff_field])
7893
7894            # Extract ANN header
7895            ann_description = vcf_reader.infos[snpeff_field].desc
7896            pattern = r"'(.+?)'"
7897            match = re.search(pattern, ann_description)
7898            if match:
7899                ann_header_match = match.group(1).split(" | ")
7900                ann_header_desc = {}
7901                for i in range(len(ann_header_match)):
7902                    ann_header_info = "".join(
7903                        char for char in ann_header_match[i] if char.isalnum()
7904                    )
7905                    ann_header_desc[ann_header_info] = ann_header_match[i]
7906                if not ann_header_desc:
7907                    raise ValueError("Invalid header description format")
7908            else:
7909                raise ValueError("Invalid header description format")
7910
7911            # Create variant id
7912            variant_id_column = self.get_variant_id_column()
7913            added_columns += [variant_id_column]
7914
7915            # Create dataframe
7916            dataframe_snpeff_hgvs = self.get_query_to_df(
7917                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
7918            )
7919
7920            # Create main NOMEN column
7921            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
7922                speff_ann_infos
7923            ].apply(
7924                lambda x: extract_snpeff_hgvs(
7925                    str(x), header=list(ann_header_desc.values())
7926                )
7927            )
7928
7929            # Add snpeff_hgvs to header
7930            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
7931                snpeff_hgvs,
7932                ".",
7933                "String",
7934                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
7935                "howard calculation",
7936                "0",
7937                self.code_type_map.get("String"),
7938            )
7939
7940            # Update
7941            sql_update = f"""
7942                UPDATE variants
7943                SET "INFO" = 
7944                    concat(
7945                        CASE
7946                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
7947                            THEN ''
7948                            ELSE concat("INFO", ';')
7949                        END,
7950                        CASE 
7951                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
7952                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
7953                            THEN concat(
7954                                    '{snpeff_hgvs}=',
7955                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
7956                                )
7957                            ELSE ''
7958                        END
7959                    )
7960                FROM dataframe_snpeff_hgvs
7961                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
7962
7963            """
7964            self.conn.execute(sql_update)
7965
7966            # Delete dataframe
7967            del dataframe_snpeff_hgvs
7968            gc.collect()
7969
7970        else:
7971
7972            log.warning(
7973                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
7974            )
7975
7976        # Remove added columns
7977        for added_column in added_columns:
7978            self.drop_column(column=added_column)

The function calculation_extract_snpeff_hgvs extracts HGVS nomenclatures from the SnpEff annotation field in a VCF file and adds them as a new column in the variants table.

Parameters
  • snpeff_hgvs: The snpeff_hgvs parameter in the calculation_extract_snpeff_hgvs function specifies the name of the column that will store the HGVS nomenclatures extracted from the SnpEff annotation field in a VCF file; defaults to snpeff_hgvs
  • snpeff_field: The snpeff_field parameter in the calculation_extract_snpeff_hgvs function represents the field in the VCF file that contains SnpEff annotations. This field is used to extract HGVS nomenclatures and add them as a new column; defaults to ANN
def calculation_snpeff_ann_explode( self, uniquify: bool = True, output_format: str = 'fields', output_prefix: str = 'snpeff_', snpeff_field: str = 'ANN') -> None:
7980    def calculation_snpeff_ann_explode(
7981        self,
7982        uniquify: bool = True,
7983        output_format: str = "fields",
7984        output_prefix: str = "snpeff_",
7985        snpeff_field: str = "ANN",
7986    ) -> None:
7987        """
7988        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
7989        exploding the HGVS field and updating variant information accordingly.
7990
7991        :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a
7992        boolean flag that determines whether the output should be uniquified or not. When set to `True`,
7993        it indicates that the output should be unique, meaning that duplicate entries should be removed,
7994        defaults to True
7995        :type uniquify: bool (optional)
7996        :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode`
7997        function specifies the format in which the output annotations will be generated. It has a
7998        default value of "fields". You can also set it to "JSON" to output the annotations in JSON
7999        format, defaults to fields
8000        :type output_format: str (optional)
8001        :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode`
8002        method is used to specify the prefix that will be added to the output annotations generated
8003        during the calculation process. This prefix helps to differentiate the newly added annotations
8004        from existing ones in the output data. By default, the, defaults to ANN_
8005        :type output_prefix: str (optional)
8006        :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode`
8007        function is used to specify the field in the VCF file that contains SnpEff annotations. This
8008        field will be processed to explode the HGVS annotations and update the variant information
8009        accordingly, defaults to ANN
8010        :type snpeff_field: str (optional)
8011        """
8012
8013        # SnpEff annotation field
8014        snpeff_hgvs = "snpeff_ann_explode"
8015
8016        # Snpeff hgvs tags
8017        vcf_infos_tags = {
8018            snpeff_hgvs: "Explode snpEff annotations",
8019        }
8020
8021        # Prefix
8022        prefix = self.get_explode_infos_prefix()
8023        if prefix:
8024            prefix = "INFO/"
8025
8026        # snpEff fields
8027        speff_ann_infos = prefix + snpeff_field
8028        speff_hgvs_infos = prefix + snpeff_hgvs
8029
8030        # Variants table
8031        table_variants = self.get_table_variants()
8032
8033        # Header
8034        vcf_reader = self.get_header()
8035
8036        # Add columns
8037        added_columns = []
8038
8039        # Explode HGVS field in column
8040        added_columns += self.explode_infos(fields=[snpeff_field])
8041        log.debug(f"snpeff_field={snpeff_field}")
8042        log.debug(f"added_columns={added_columns}")
8043
8044        if snpeff_field in vcf_reader.infos:
8045
8046            # Extract ANN header
8047            ann_description = vcf_reader.infos[snpeff_field].desc
8048            pattern = r"'(.+?)'"
8049            match = re.search(pattern, ann_description)
8050            if match:
8051                ann_header_match = match.group(1).split(" | ")
8052                ann_header = []
8053                ann_header_desc = {}
8054                for i in range(len(ann_header_match)):
8055                    ann_header_info = "".join(
8056                        char for char in ann_header_match[i] if char.isalnum()
8057                    )
8058                    ann_header.append(ann_header_info)
8059                    ann_header_desc[ann_header_info] = ann_header_match[i]
8060                if not ann_header_desc:
8061                    raise ValueError("Invalid header description format")
8062            else:
8063                raise ValueError("Invalid header description format")
8064
8065            # Create variant id
8066            variant_id_column = self.get_variant_id_column()
8067            added_columns += [variant_id_column]
8068
8069            # Create dataframe
8070            dataframe_snpeff_hgvs = self.get_query_to_df(
8071                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
8072            )
8073
8074            # Create snpEff columns
8075            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
8076                speff_ann_infos
8077            ].apply(
8078                lambda x: explode_snpeff_ann(
8079                    str(x),
8080                    uniquify=uniquify,
8081                    output_format=output_format,
8082                    prefix=output_prefix,
8083                    header=list(ann_header_desc.values()),
8084                )
8085            )
8086
8087            # Header
8088            ann_annotations_prefix = ""
8089            if output_format.upper() in ["JSON"]:
8090                ann_annotations_prefix = f"{output_prefix}="
8091                vcf_reader.infos[output_prefix] = vcf.parser._Info(
8092                    output_prefix,
8093                    ".",
8094                    "String",
8095                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
8096                    + " - JSON format",
8097                    "howard calculation",
8098                    "0",
8099                    self.code_type_map.get("String"),
8100                )
8101            else:
8102                for ann_annotation in ann_header:
8103                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
8104                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
8105                        ann_annotation_id,
8106                        ".",
8107                        "String",
8108                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
8109                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
8110                        "howard calculation",
8111                        "0",
8112                        self.code_type_map.get("String"),
8113                    )
8114
8115            # Update
8116            sql_update = f"""
8117                UPDATE variants
8118                SET "INFO" = 
8119                    concat(
8120                        CASE
8121                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8122                            THEN ''
8123                            ELSE concat("INFO", ';')
8124                        END,
8125                        CASE 
8126                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
8127                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
8128                            THEN concat(
8129                                '{ann_annotations_prefix}',
8130                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
8131                                )
8132                            ELSE ''
8133                        END
8134                    )
8135                FROM dataframe_snpeff_hgvs
8136                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
8137
8138            """
8139            self.conn.execute(sql_update)
8140
8141            # Delete dataframe
8142            del dataframe_snpeff_hgvs
8143            gc.collect()
8144
8145        else:
8146
8147            log.warning(
8148                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
8149            )
8150
8151        # Remove added columns
8152        for added_column in added_columns:
8153            self.drop_column(column=added_column)

The calculation_snpeff_ann_explode function processes SnpEff annotations in a VCF file by exploding the HGVS field and updating variant information accordingly.

Parameters
  • uniquify: The uniquify parameter in the calculation_snpeff_ann_explode method is a boolean flag that determines whether the output should be uniquified or not. When set to True, it indicates that the output should be unique, meaning that duplicate entries should be removed, defaults to True
  • output_format: The output_format parameter in the calculation_snpeff_ann_explode function specifies the format in which the output annotations will be generated. It has a default value of "fields". You can also set it to "JSON" to output the annotations in JSON format, defaults to fields
  • output_prefix: The output_prefix parameter in the calculation_snpeff_ann_explode method specifies the prefix added to the output annotations generated during the calculation process, helping to differentiate the newly added annotations from existing ones; defaults to snpeff_
  • snpeff_field: The snpeff_field parameter in the calculation_snpeff_ann_explode function is used to specify the field in the VCF file that contains SnpEff annotations. This field will be processed to explode the HGVS annotations and update the variant information accordingly, defaults to ANN
def calculation_extract_nomen(self) -> None:
8155    def calculation_extract_nomen(self) -> None:
8156        """
8157        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
8158        """
8159
8160        # NOMEN field
8161        field_nomen_dict = "NOMEN_DICT"
8162
8163        # NOMEN structure
8164        nomen_dict = {
8165            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
8166            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
8167            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
8168            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
8169            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
8170            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
8171            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
8172            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
8173            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
8174            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
8175        }
8176
8177        # Param
8178        param = self.get_param()
8179
8180        # Prefix
8181        prefix = self.get_explode_infos_prefix()
8182
8183        # Header
8184        vcf_reader = self.get_header()
8185
8186        # Get HGVS field
8187        hgvs_field = (
8188            param.get("calculation", {})
8189            .get("calculations", {})
8190            .get("NOMEN", {})
8191            .get("options", {})
8192            .get("hgvs_field", "hgvs")
8193        )
8194
8195        # Get transcripts
8196        transcripts_file = (
8197            param.get("calculation", {})
8198            .get("calculations", {})
8199            .get("NOMEN", {})
8200            .get("options", {})
8201            .get("transcripts", None)
8202        )
8203        transcripts_file = full_path(transcripts_file)
8204        transcripts = []
8205        if transcripts_file:
8206            if os.path.exists(transcripts_file):
8207                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
8208                transcripts = transcripts_dataframe.iloc[:, 0].tolist()
8209            else:
8210                log.error(f"Transcript file '{transcripts_file}' does NOT exist")
8211                raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")
8212
8213        # Added columns
8214        added_columns = []
8215
8216        # Explode HGVS field in column
8217        added_columns += self.explode_infos(fields=[hgvs_field])
8218
8219        # extra infos
8220        extra_infos = self.get_extra_infos()
8221        extra_field = prefix + hgvs_field
8222
8223        if extra_field in extra_infos:
8224
8225            # Create dataframe
8226            dataframe_hgvs = self.get_query_to_df(
8227                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """
8228            )
8229
8230            # Create main NOMEN column
8231            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
8232                lambda x: find_nomen(str(x), transcripts=transcripts)
8233            )
8234
8235            # Explode NOMEN Structure and create SQL set for update
8236            sql_nomen_fields = []
8237            for nomen_field in nomen_dict:
8238
8239                # Explode each field into a column
8240                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
8241                    lambda x: dict(x).get(nomen_field, "")
8242                )
8243
8244                # Create VCF header field
8245                vcf_reader.infos[nomen_field] = vcf.parser._Info(
8246                    nomen_field,
8247                    ".",
8248                    "String",
8249                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
8250                    "howard calculation",
8251                    "0",
8252                    self.code_type_map.get("String"),
8253                )
8254                sql_nomen_fields.append(
8255                    f"""
8256                        CASE 
8257                            WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
8258                            THEN concat(
8259                                    ';{nomen_field}=',
8260                                    dataframe_hgvs."{nomen_field}"
8261                                )
8262                            ELSE ''
8263                        END
8264                    """
8265                )
8266
8267            # SQL set for update
8268            sql_nomen_fields_set = ", ".join(sql_nomen_fields)
8269
8270            # Update
8271            sql_update = f"""
8272                UPDATE variants
8273                SET "INFO" = 
8274                    concat(
8275                        CASE
8276                            WHEN "INFO" IS NULL
8277                            THEN ''
8278                            ELSE "INFO"
8279                        END,
8280                        {sql_nomen_fields_set}
8281                    )
8282                FROM dataframe_hgvs
8283                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
8284                    AND variants."POS" = dataframe_hgvs."POS" 
8285                    AND variants."REF" = dataframe_hgvs."REF"
8286                    AND variants."ALT" = dataframe_hgvs."ALT"
8287            """
8288            self.conn.execute(sql_update)
8289
8290            # Delete dataframe
8291            del dataframe_hgvs
8292            gc.collect()
8293
8294        # Remove added columns
8295        for added_column in added_columns:
8296            self.drop_column(column=added_column)

This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.

def calculation_find_by_pipeline(self, tag: str = 'findbypipeline') -> None:
8298    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
8299        """
8300        The function `calculation_find_by_pipeline` performs a calculation to find the number of
8301        pipeline/sample for a variant and updates the variant information in a VCF file.
8302
8303        :param tag: The `tag` parameter is a string that represents the annotation field for the
8304        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
8305        VCF header and to update the corresponding field in the variants table, defaults to
8306        findbypipeline
8307        :type tag: str (optional)
8308        """
8309
8310        # if FORMAT and samples
8311        if (
8312            "FORMAT" in self.get_header_columns_as_list()
8313            and self.get_header_sample_list()
8314        ):
8315
8316            # findbypipeline annotation field
8317            findbypipeline_tag = tag
8318
8319            # VCF infos tags
8320            vcf_infos_tags = {
8321                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
8322            }
8323
8324            # Prefix
8325            prefix = self.get_explode_infos_prefix()
8326
8327            # Field
8328            findbypipeline_infos = prefix + findbypipeline_tag
8329
8330            # Variants table
8331            table_variants = self.get_table_variants()
8332
8333            # Header
8334            vcf_reader = self.get_header()
8335
8336            # Create variant id
8337            variant_id_column = self.get_variant_id_column()
8338            added_columns = [variant_id_column]
8339
8340            # variant_id, FORMAT and samples
8341            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8342                self.get_header_sample_list()
8343            )
8344
8345            # Create dataframe
8346            dataframe_findbypipeline = self.get_query_to_df(
8347                f""" SELECT {samples_fields} FROM {table_variants} """
8348            )
8349
8350            # Create findbypipeline column
8351            dataframe_findbypipeline[findbypipeline_infos] = (
8352                dataframe_findbypipeline.apply(
8353                    lambda row: findbypipeline(
8354                        row, samples=self.get_header_sample_list()
8355                    ),
8356                    axis=1,
8357                )
8358            )
8359
8360            # Add snpeff_hgvs to header
8361            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
8362                findbypipeline_tag,
8363                ".",
8364                "String",
8365                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
8366                "howard calculation",
8367                "0",
8368                self.code_type_map.get("String"),
8369            )
8370
8371            # Update
8372            sql_update = f"""
8373                UPDATE variants
8374                SET "INFO" = 
8375                    concat(
8376                        CASE
8377                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8378                            THEN ''
8379                            ELSE concat("INFO", ';')
8380                        END,
8381                        CASE 
8382                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
8383                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
8384                            THEN concat(
8385                                    '{findbypipeline_tag}=',
8386                                    dataframe_findbypipeline."{findbypipeline_infos}"
8387                                )
8388                            ELSE ''
8389                        END
8390                    )
8391                FROM dataframe_findbypipeline
8392                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
8393            """
8394            self.conn.execute(sql_update)
8395
8396            # Remove added columns
8397            for added_column in added_columns:
8398                self.drop_column(column=added_column)
8399
8400            # Delete dataframe
8401            del dataframe_findbypipeline
8402            gc.collect()

The function calculation_find_by_pipeline performs a calculation to find the number of pipeline/sample for a variant and updates the variant information in a VCF file.

Parameters
  • tag: The tag parameter is a string that represents the annotation field for the "findbypipeline" information in the VCF file. It is used to create the annotation field in the VCF header and to update the corresponding field in the variants table, defaults to findbypipeline
def calculation_genotype_concordance(self) -> None:
8404    def calculation_genotype_concordance(self) -> None:
8405        """
8406        The function `calculation_genotype_concordance` calculates the genotype concordance for
8407        multi-caller VCF files and updates the variant information in the database.
8408        """
8409
8410        # if FORMAT and samples
8411        if (
8412            "FORMAT" in self.get_header_columns_as_list()
8413            and self.get_header_sample_list()
8414        ):
8415
8416            # genotypeconcordance annotation field
8417            genotypeconcordance_tag = "genotypeconcordance"
8418
8419            # VCF infos tags
8420            vcf_infos_tags = {
8421                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
8422            }
8423
8424            # Prefix
8425            prefix = self.get_explode_infos_prefix()
8426
8427            # Field
8428            genotypeconcordance_infos = prefix + genotypeconcordance_tag
8429
8430            # Variants table
8431            table_variants = self.get_table_variants()
8432
8433            # Header
8434            vcf_reader = self.get_header()
8435
8436            # Create variant id
8437            variant_id_column = self.get_variant_id_column()
8438            added_columns = [variant_id_column]
8439
8440            # variant_id, FORMAT and samples
8441            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8442                self.get_header_sample_list()
8443            )
8444
8445            # Create dataframe
8446            dataframe_genotypeconcordance = self.get_query_to_df(
8447                f""" SELECT {samples_fields} FROM {table_variants} """
8448            )
8449
8450            # Create genotypeconcordance column
8451            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
8452                dataframe_genotypeconcordance.apply(
8453                    lambda row: genotypeconcordance(
8454                        row, samples=self.get_header_sample_list()
8455                    ),
8456                    axis=1,
8457                )
8458            )
8459
8460            # Add genotypeconcordance to header
8461            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
8462                genotypeconcordance_tag,
8463                ".",
8464                "String",
8465                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
8466                "howard calculation",
8467                "0",
8468                self.code_type_map.get("String"),
8469            )
8470
8471            # Update
8472            sql_update = f"""
8473                UPDATE variants
8474                SET "INFO" = 
8475                    concat(
8476                        CASE
8477                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8478                            THEN ''
8479                            ELSE concat("INFO", ';')
8480                        END,
8481                        CASE
8482                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
8483                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
8484                            THEN concat(
8485                                    '{genotypeconcordance_tag}=',
8486                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
8487                                )
8488                            ELSE ''
8489                        END
8490                    )
8491                FROM dataframe_genotypeconcordance
8492                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
8493            """
8494            self.conn.execute(sql_update)
8495
8496            # Remove added columns
8497            for added_column in added_columns:
8498                self.drop_column(column=added_column)
8499
8500            # Delete dataframe
8501            del dataframe_genotypeconcordance
8502            gc.collect()

The function calculation_genotype_concordance calculates the genotype concordance for multi-caller VCF files and updates the variant information in the database.

def calculation_barcode(self, tag: str = 'barcode') -> None:
8504    def calculation_barcode(self, tag: str = "barcode") -> None:
8505        """
8506        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
8507        updates the INFO field in the file with the calculated barcode values.
8508        
8509        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
8510        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
8511        the default tag name is set to "barcode", defaults to barcode
8512        :type tag: str (optional)
8513        """
8514
8515        # if FORMAT and samples
8516        if (
8517            "FORMAT" in self.get_header_columns_as_list()
8518            and self.get_header_sample_list()
8519        ):
8520
8521            # barcode annotation field
8522            if not tag:
8523                tag = "barcode"
8524
8525            # VCF infos tags
8526            vcf_infos_tags = {
8527                tag: "barcode calculation (VaRank)",
8528            }
8529
8530            # Prefix
8531            prefix = self.get_explode_infos_prefix()
8532
8533            # Field
8534            barcode_infos = prefix + tag
8535
8536            # Variants table
8537            table_variants = self.get_table_variants()
8538
8539            # Header
8540            vcf_reader = self.get_header()
8541
8542            # Create variant id
8543            variant_id_column = self.get_variant_id_column()
8544            added_columns = [variant_id_column]
8545
8546            # variant_id, FORMAT and samples
8547            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8548                self.get_header_sample_list()
8549            )
8550
8551            # Create dataframe
8552            dataframe_barcode = self.get_query_to_df(
8553                f""" SELECT {samples_fields} FROM {table_variants} """
8554            )
8555
8556            # Create barcode column
8557            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
8558                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
8559            )
8560
8561            # Add barcode to header
8562            vcf_reader.infos[tag] = vcf.parser._Info(
8563                tag,
8564                ".",
8565                "String",
8566                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
8567                "howard calculation",
8568                "0",
8569                self.code_type_map.get("String"),
8570            )
8571
8572            # Update
8573            sql_update = f"""
8574                UPDATE {table_variants}
8575                SET "INFO" = 
8576                    concat(
8577                        CASE
8578                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8579                            THEN ''
8580                            ELSE concat("INFO", ';')
8581                        END,
8582                        CASE
8583                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
8584                            AND dataframe_barcode."{barcode_infos}" NOT NULL
8585                            THEN concat(
8586                                    '{tag}=',
8587                                    dataframe_barcode."{barcode_infos}"
8588                                )
8589                            ELSE ''
8590                        END
8591                    )
8592                FROM dataframe_barcode
8593                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
8594            """
8595            self.conn.execute(sql_update)
8596
8597            # Remove added columns
8598            for added_column in added_columns:
8599                self.drop_column(column=added_column)
8600
8601            # Delete dataframe
8602            del dataframe_barcode
8603            gc.collect()

The calculation_barcode function calculates barcode values for variants in a VCF file and updates the INFO field in the file with the calculated barcode values.

Parameters
  • tag: The tag parameter in the calculation_barcode function is used to specify the tag name that will be used for the barcode calculation in the VCF file. If no tag name is provided, the default tag name is set to "barcode", defaults to barcode
def calculation_barcode_family(self, tag: str = 'BCF') -> None:
8605    def calculation_barcode_family(self, tag: str = "BCF") -> None:
8606        """
8607        The `calculation_barcode_family` function calculates barcode values for variants in a VCF file
8608        and updates the INFO field in the file with the calculated barcode values.
8609
8610        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
8611        the barcode tag that will be added to the VCF file during the calculation process. If no value
8612        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
8613        :type tag: str (optional)
8614        """
8615
8616        # if FORMAT and samples
8617        if (
8618            "FORMAT" in self.get_header_columns_as_list()
8619            and self.get_header_sample_list()
8620        ):
8621
8622            # barcode annotation field
8623            if not tag:
8624                tag = "BCF"
8625
8626            # VCF infos tags
8627            vcf_infos_tags = {
8628                tag: "barcode family calculation",
8629                f"{tag}S": "barcode family samples",
8630            }
8631
8632            # Param
8633            param = self.get_param()
8634            log.debug(f"param={param}")
8635
8636            # Prefix
8637            prefix = self.get_explode_infos_prefix()
8638
8639            # PED param
8640            ped = (
8641                param.get("calculation", {})
8642                .get("calculations", {})
8643                .get("BARCODEFAMILY", {})
8644                .get("family_pedigree", None)
8645            )
8646            log.debug(f"ped={ped}")
8647
8648            # Load PED
8649            if ped:
8650
8651                # Pedigree is a file
8652                if isinstance(ped, str) and os.path.exists(full_path(ped)):
8653                    log.debug("Pedigree is file")
8654                    with open(full_path(ped)) as ped:
8655                        ped = json.load(ped)
8656
8657                # Pedigree is a string
8658                elif isinstance(ped, str):
8659                    log.debug("Pedigree is str")
8660                    try:
8661                        ped = json.loads(ped)
8662                        log.debug("Pedigree is json str")
8663                    except ValueError as e:
8664                        ped_samples = ped.split(",")
8665                        ped = {}
8666                        for ped_sample in ped_samples:
8667                            ped[ped_sample] = ped_sample
8668
8669                # Pedigree is a dict
8670                elif isinstance(ped, dict):
8671                    log.debug("Pedigree is dict")
8672
8673                # Pedigree is not well formatted
8674                else:
8675                    msg_error = "Pedigree not well formatted"
8676                    log.error(msg_error)
8677                    raise ValueError(msg_error)
8678
8679                # Construct list
8680                ped_samples = list(ped.values())
8681
8682            else:
8683                log.debug("Pedigree not defined. Take all samples")
8684                ped_samples = self.get_header_sample_list()
8685                ped = {}
8686                for ped_sample in ped_samples:
8687                    ped[ped_sample] = ped_sample
8688
8689            # Check pedigree
8690            if not ped or len(ped) == 0:
8691                msg_error = f"Error in pedigree: samples {ped_samples}"
8692                log.error(msg_error)
8693                raise ValueError(msg_error)
8694
8695            # Log
8696            log.info(
8697                "Calculation 'BARCODEFAMILY' - Samples: "
8698                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
8699            )
8700            log.debug(f"ped_samples={ped_samples}")
8701
8702            # Field
8703            barcode_infos = prefix + tag
8704
8705            # Variants table
8706            table_variants = self.get_table_variants()
8707
8708            # Header
8709            vcf_reader = self.get_header()
8710
8711            # Create variant id
8712            variant_id_column = self.get_variant_id_column()
8713            added_columns = [variant_id_column]
8714
8715            # variant_id, FORMAT and samples
8716            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8717                ped_samples
8718            )
8719
8720            # Create dataframe
8721            dataframe_barcode = self.get_query_to_df(
8722                f""" SELECT {samples_fields} FROM {table_variants} """
8723            )
8724
8725            # Create barcode column
8726            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
8727                lambda row: barcode(row, samples=ped_samples), axis=1
8728            )
8729
8730            # Add barcode family to header
8731            # Add vaf_normalization to header
8732            vcf_reader.formats[tag] = vcf.parser._Format(
8733                id=tag,
8734                num=".",
8735                type="String",
8736                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
8737                type_code=self.code_type_map.get("String"),
8738            )
8739            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
8740                id=f"{tag}S",
8741                num=".",
8742                type="String",
8743                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
8744                type_code=self.code_type_map.get("String"),
8745            )
8746
8747            # Update
8748            # for sample in ped_samples:
8749            sql_update_set = []
8750            for sample in self.get_header_sample_list() + ["FORMAT"]:
8751                if sample in ped_samples:
8752                    value = f'dataframe_barcode."{barcode_infos}"'
8753                    value_samples = "'" + ",".join(ped_samples) + "'"
8754                elif sample == "FORMAT":
8755                    value = f"'{tag}'"
8756                    value_samples = f"'{tag}S'"
8757                else:
8758                    value = "'.'"
8759                    value_samples = "'.'"
8760                format_regex = r"[a-zA-Z0-9\s]"
8761                sql_update_set.append(
8762                    f"""
8763                        "{sample}" = 
8764                        concat(
8765                            CASE
8766                                WHEN {table_variants}."{sample}" = './.'
8767                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
8768                                ELSE {table_variants}."{sample}"
8769                            END,
8770                            ':',
8771                            {value},
8772                            ':',
8773                            {value_samples}
8774                        )
8775                    """
8776                )
8777
8778            sql_update_set_join = ", ".join(sql_update_set)
8779            sql_update = f"""
8780                UPDATE {table_variants}
8781                SET {sql_update_set_join}
8782                FROM dataframe_barcode
8783                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
8784            """
8785            self.conn.execute(sql_update)
8786
8787            # Remove added columns
8788            for added_column in added_columns:
8789                self.drop_column(column=added_column)
8790
8791            # Delete dataframe
8792            del dataframe_barcode
8793            gc.collect()

The calculation_barcode_family function calculates barcode values for variants in a VCF file and updates the INFO field in the file with the calculated barcode values.

Parameters
  • tag: The tag parameter in the calculation_barcode_family function is used to specify the barcode tag that will be added to the VCF file during the calculation process. If no value is provided for the tag parameter, the default value used is "BCF", defaults to BCF
def calculation_trio(self) -> None:
8795    def calculation_trio(self) -> None:
8796        """
8797        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
8798        information to the INFO field of each variant.
8799        """
8800
8801        # if FORMAT and samples
8802        if (
8803            "FORMAT" in self.get_header_columns_as_list()
8804            and self.get_header_sample_list()
8805        ):
8806
8807            # trio annotation field
8808            trio_tag = "trio"
8809
8810            # VCF infos tags
8811            vcf_infos_tags = {
8812                "trio": "trio calculation",
8813            }
8814
8815            # Param
8816            param = self.get_param()
8817
8818            # Prefix
8819            prefix = self.get_explode_infos_prefix()
8820
8821            # Trio param
8822            trio_ped = (
8823                param.get("calculation", {})
8824                .get("calculations", {})
8825                .get("TRIO", {})
8826                .get("trio_pedigree", None)
8827            )
8828
8829            # Load trio
8830            if trio_ped:
8831
8832                # Trio pedigree is a file
8833                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
8834                    log.debug("TRIO pedigree is file")
8835                    with open(full_path(trio_ped)) as trio_ped:
8836                        trio_ped = json.load(trio_ped)
8837
8838                # Trio pedigree is a string
8839                elif isinstance(trio_ped, str):
8840                    log.debug("TRIO pedigree is str")
8841                    try:
8842                        trio_ped = json.loads(trio_ped)
8843                        log.debug("TRIO pedigree is json str")
8844                    except ValueError as e:
8845                        trio_samples = trio_ped.split(",")
8846                        if len(trio_samples) == 3:
8847                            trio_ped = {
8848                                "father": trio_samples[0],
8849                                "mother": trio_samples[1],
8850                                "child": trio_samples[2],
8851                            }
8852                            log.debug("TRIO pedigree is list str")
8853                        else:
8854                            msg_error = "TRIO pedigree not well formatted"
8855                            log.error(msg_error)
8856                            raise ValueError(msg_error)
8857
8858                # Trio pedigree is a dict
8859                elif isinstance(trio_ped, dict):
8860                    log.debug("TRIO pedigree is dict")
8861
8862                # Trio pedigree is not well formatted
8863                else:
8864                    msg_error = "TRIO pedigree not well formatted"
8865                    log.error(msg_error)
8866                    raise ValueError(msg_error)
8867
8868                # Construct trio list
8869                trio_samples = [
8870                    trio_ped.get("father", ""),
8871                    trio_ped.get("mother", ""),
8872                    trio_ped.get("child", ""),
8873                ]
8874
8875            else:
8876                log.debug("TRIO pedigree not defined. Take the first 3 samples")
8877                samples_list = self.get_header_sample_list()
8878                if len(samples_list) >= 3:
8879                    trio_samples = self.get_header_sample_list()[0:3]
8880                    trio_ped = {
8881                        "father": trio_samples[0],
8882                        "mother": trio_samples[1],
8883                        "child": trio_samples[2],
8884                    }
8885                else:
8886                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
8887                    log.error(msg_error)
8888                    raise ValueError(msg_error)
8889
8890            # Check trio pedigree
8891            if not trio_ped or len(trio_ped) != 3:
8892                msg_error = f"Error in TRIO pedigree: {trio_ped}"
8893                log.error(msg_error)
8894                raise ValueError(msg_error)
8895
8896            # Log
8897            log.info(
8898                f"Calculation 'TRIO' - Samples: "
8899                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
8900            )
8901
8902            # Field
8903            trio_infos = prefix + trio_tag
8904
8905            # Variants table
8906            table_variants = self.get_table_variants()
8907
8908            # Header
8909            vcf_reader = self.get_header()
8910
8911            # Create variant id
8912            variant_id_column = self.get_variant_id_column()
8913            added_columns = [variant_id_column]
8914
8915            # variant_id, FORMAT and samples
8916            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8917                self.get_header_sample_list()
8918            )
8919
8920            # Create dataframe
8921            dataframe_trio = self.get_query_to_df(
8922                f""" SELECT {samples_fields} FROM {table_variants} """
8923            )
8924
8925            # Create trio column
8926            dataframe_trio[trio_infos] = dataframe_trio.apply(
8927                lambda row: trio(row, samples=trio_samples), axis=1
8928            )
8929
8930            # Add trio to header
8931            vcf_reader.infos[trio_tag] = vcf.parser._Info(
8932                trio_tag,
8933                ".",
8934                "String",
8935                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
8936                "howard calculation",
8937                "0",
8938                self.code_type_map.get("String"),
8939            )
8940
8941            # Update
8942            sql_update = f"""
8943                UPDATE {table_variants}
8944                SET "INFO" = 
8945                    concat(
8946                        CASE
8947                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8948                            THEN ''
8949                            ELSE concat("INFO", ';')
8950                        END,
8951                        CASE
8952                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
8953                             AND dataframe_trio."{trio_infos}" NOT NULL
8954                            THEN concat(
8955                                    '{trio_tag}=',
8956                                    dataframe_trio."{trio_infos}"
8957                                )
8958                            ELSE ''
8959                        END
8960                    )
8961                FROM dataframe_trio
8962                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
8963            """
8964            self.conn.execute(sql_update)
8965
8966            # Remove added columns
8967            for added_column in added_columns:
8968                self.drop_column(column=added_column)
8969
8970            # Delete dataframe
8971            del dataframe_trio
8972            gc.collect()

The calculation_trio function performs trio calculations on a VCF file by adding trio information to the INFO field of each variant.

def calculation_vaf_normalization(self) -> None:
8974    def calculation_vaf_normalization(self) -> None:
8975        """
8976        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
8977        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
8978        :return: The function does not return anything.
8979        """
8980
8981        # if FORMAT and samples
8982        if (
8983            "FORMAT" in self.get_header_columns_as_list()
8984            and self.get_header_sample_list()
8985        ):
8986
8987            # vaf_normalization annotation field
8988            vaf_normalization_tag = "VAF"
8989
8990            # VCF infos tags
8991            vcf_infos_tags = {
8992                "VAF": "VAF Variant Frequency",
8993            }
8994
8995            # Prefix
8996            prefix = self.get_explode_infos_prefix()
8997
8998            # Variants table
8999            table_variants = self.get_table_variants()
9000
9001            # Header
9002            vcf_reader = self.get_header()
9003
9004            # Do not calculate if VAF already exists
9005            if "VAF" in vcf_reader.formats:
9006                log.debug("VAF already on genotypes")
9007                return
9008
9009            # Create variant id
9010            variant_id_column = self.get_variant_id_column()
9011            added_columns = [variant_id_column]
9012
9013            # variant_id, FORMAT and samples
9014            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9015                self.get_header_sample_list()
9016            )
9017
9018            # Create dataframe
9019            dataframe_vaf_normalization = self.get_query_to_df(
9020                f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
9021            )
9022
9023            vaf_normalization_set = []
9024
9025            # for each sample vaf_normalization
9026            for sample in self.get_header_sample_list():
9027                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
9028                    lambda row: vaf_normalization(row, sample=sample), axis=1
9029                )
9030                vaf_normalization_set.append(
9031                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
9032                )
9033
9034            # Add VAF to FORMAT
9035            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
9036                "FORMAT"
9037            ].apply(lambda x: str(x) + ":VAF")
9038            vaf_normalization_set.append(
9039                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
9040            )
9041
9042            # Add vaf_normalization to header
9043            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
9044                id=vaf_normalization_tag,
9045                num="1",
9046                type="Float",
9047                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
9048                type_code=self.code_type_map.get("Float"),
9049            )
9050
9051            # Create fields to add in INFO
9052            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
9053
9054            # Update
9055            sql_update = f"""
9056                UPDATE {table_variants}
9057                SET {sql_vaf_normalization_set}
9058                FROM dataframe_vaf_normalization
9059                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
9060
9061            """
9062            self.conn.execute(sql_update)
9063
9064            # Remove added columns
9065            for added_column in added_columns:
9066                self.drop_column(column=added_column)
9067
9068            # Delete dataframe
9069            del dataframe_vaf_normalization
9070            gc.collect()

The calculation_vaf_normalization function calculates the VAF (Variant Allele Frequency) normalization for each sample in a VCF file and updates the FORMAT field and per-sample genotype fields accordingly.

Returns

The function does not return anything.

def calculation_genotype_stats(self, info: str = 'VAF') -> None:
9072    def calculation_genotype_stats(self, info: str = "VAF") -> None:
9073        """
9074        The `calculation_genotype_stats` function calculates genotype statistics for a given information
9075        field in a VCF file and updates the INFO column of the variants table with the calculated
9076        statistics.
9077
9078        :param info: The `info` parameter is a string that represents the type of information for which
9079        genotype statistics are calculated. It is used to generate various VCF info tags for the
9080        statistics, such as the number of occurrences, the list of values, the minimum value, the
9081        maximum value, the mean, the median, defaults to VAF
9082        :type info: str (optional)
9083        """
9084
9085        # if FORMAT and samples
9086        if (
9087            "FORMAT" in self.get_header_columns_as_list()
9088            and self.get_header_sample_list()
9089        ):
9090
9091            # vaf_stats annotation field
9092            vaf_stats_tag = info + "_stats"
9093
9094            # VCF infos tags
9095            vcf_infos_tags = {
9096                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
9097                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
9098                info + "_stats_min": f"genotype {info} Statistics - min {info}",
9099                info + "_stats_max": f"genotype {info} Statistics - max {info}",
9100                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
9101                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
9102                info
9103                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
9104            }
9105
9106            # Prefix
9107            prefix = self.get_explode_infos_prefix()
9108
9109            # Field
9110            vaf_stats_infos = prefix + vaf_stats_tag
9111
9112            # Variants table
9113            table_variants = self.get_table_variants()
9114
9115            # Header
9116            vcf_reader = self.get_header()
9117
9118            # Create variant id
9119            variant_id_column = self.get_variant_id_column()
9120            added_columns = [variant_id_column]
9121
9122            # variant_id, FORMAT and samples
9123            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9124                self.get_header_sample_list()
9125            )
9126
9127            # Create dataframe
9128            dataframe_vaf_stats = self.get_query_to_df(
9129                f""" SELECT {samples_fields} FROM {table_variants} """
9130            )
9131
9132            # Create vaf_stats column
9133            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
9134                lambda row: genotype_stats(
9135                    row, samples=self.get_header_sample_list(), info=info
9136                ),
9137                axis=1,
9138            )
9139
9140            # List of vcf tags
9141            sql_vaf_stats_fields = []
9142
9143            # Check all VAF stats infos
9144            for stat in vcf_infos_tags:
9145
9146                # Extract stats
9147                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
9148                    lambda x: dict(x).get(stat, "")
9149                )
9150
9151                # Add snpeff_hgvs to header
9152                vcf_reader.infos[stat] = vcf.parser._Info(
9153                    stat,
9154                    ".",
9155                    "String",
9156                    vcf_infos_tags.get(stat, "genotype statistics"),
9157                    "howard calculation",
9158                    "0",
9159                    self.code_type_map.get("String"),
9160                )
9161
9162                if len(sql_vaf_stats_fields):
9163                    sep = ";"
9164                else:
9165                    sep = ""
9166
9167                # Create fields to add in INFO
9168                sql_vaf_stats_fields.append(
9169                    f"""
9170                        CASE
9171                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
9172                            THEN concat(
9173                                    '{sep}{stat}=',
9174                                    dataframe_vaf_stats."{stat}"
9175                                )
9176                            ELSE ''
9177                        END
9178                    """
9179                )
9180
9181            # SQL set for update
9182            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)
9183
9184            # Update
9185            sql_update = f"""
9186                UPDATE variants
9187                SET "INFO" = 
9188                    concat(
9189                        CASE
9190                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
9191                            THEN ''
9192                            ELSE concat("INFO", ';')
9193                        END,
9194                        {sql_vaf_stats_fields_set}
9195                    )
9196                FROM dataframe_vaf_stats
9197                WHERE variants."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"
9198
9199            """
9200            self.conn.execute(sql_update)
9201
9202            # Remove added columns
9203            for added_column in added_columns:
9204                self.drop_column(column=added_column)
9205
9206            # Delete dataframe
9207            del dataframe_vaf_stats
9208            gc.collect()

The calculation_genotype_stats function calculates genotype statistics for a given information field in a VCF file and updates the INFO column of the variants table with the calculated statistics.

Parameters
  • info: The info parameter is a string that represents the type of information for which genotype statistics are calculated. It is used to generate various VCF info tags for the statistics, such as the number of occurrences, the list of values, the minimum value, the maximum value, the mean, the median, defaults to VAF